diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,154022 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.999366420274551, + "global_step": 11830, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.403590202331543, + "epoch": 0.0, + "learning_rate": 4.2265426880811495e-08, + "loss": 10.0071, + "step": 1, + "task_loss": 4.893718242645264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.050601959228516, + "epoch": 0.0, + "learning_rate": 8.453085376162299e-08, + "loss": 11.6804, + "step": 2, + "task_loss": 4.735933303833008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.589404106140137, + "epoch": 0.0, + "learning_rate": 1.267962806424345e-07, + "loss": 10.5271, + "step": 3, + "task_loss": 4.754335880279541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.161865234375, + "epoch": 0.0, + "learning_rate": 1.6906170752324598e-07, + "loss": 10.5263, + "step": 4, + "task_loss": 4.63836145401001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 13.924878120422363, + "epoch": 0.0, + "learning_rate": 2.113271344040575e-07, + "loss": 10.962, + "step": 5, + "task_loss": 4.719663143157959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.134042739868164, + "epoch": 0.01, + "learning_rate": 2.53592561284869e-07, + "loss": 10.352, + "step": 6, + "task_loss": 4.635924816131592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.69445514678955, + "epoch": 0.01, + "learning_rate": 2.958579881656805e-07, + "loss": 10.8653, + "step": 7, + "task_loss": 4.6902756690979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.52145004272461, + "epoch": 0.01, + "learning_rate": 3.3812341504649196e-07, + "loss": 10.892, + "step": 8, + "task_loss": 4.786701679229736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.859132766723633, + "epoch": 0.01, + "learning_rate": 3.803888419273035e-07, + "loss": 9.9271, + "step": 9, + "task_loss": 4.666134357452393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.892845153808594, + "epoch": 0.01, + "learning_rate": 4.22654268808115e-07, + "loss": 11.1902, + "step": 10, + "task_loss": 4.827131271362305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.09437370300293, + "epoch": 0.01, + "learning_rate": 4.649196956889265e-07, + "loss": 10.3811, + "step": 11, + "task_loss": 4.565781593322754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.831682205200195, + "epoch": 0.01, + "learning_rate": 5.07185122569738e-07, + "loss": 11.1487, + "step": 12, + "task_loss": 4.640714168548584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.277006149291992, + "epoch": 0.01, + "learning_rate": 5.494505494505495e-07, + "loss": 10.7189, + "step": 13, + "task_loss": 4.608959674835205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.255281448364258, + "epoch": 0.01, + "learning_rate": 5.91715976331361e-07, + "loss": 10.9164, + "step": 14, + "task_loss": 4.694131374359131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.481856346130371, + "epoch": 0.01, + "learning_rate": 6.339814032121725e-07, + "loss": 10.8891, + "step": 15, + "task_loss": 4.608437538146973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.204913139343262, + "epoch": 0.01, + "learning_rate": 6.762468300929839e-07, + "loss": 10.847, + "step": 16, + "task_loss": 4.60659122467041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.41554069519043, + "epoch": 0.01, + "learning_rate": 7.185122569737954e-07, + "loss": 10.6778, + "step": 17, + "task_loss": 4.496872901916504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.610633850097656, + "epoch": 0.02, + "learning_rate": 7.60777683854607e-07, + "loss": 10.4457, + "step": 18, + "task_loss": 4.7978034019470215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.160855293273926, + "epoch": 0.02, + "learning_rate": 8.030431107354184e-07, + "loss": 10.2446, + "step": 19, + "task_loss": 4.690127849578857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.536073684692383, + "epoch": 0.02, + "learning_rate": 8.4530853761623e-07, + "loss": 10.7825, + "step": 20, + "task_loss": 4.7647786140441895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.957894325256348, + "epoch": 0.02, + "learning_rate": 8.875739644970415e-07, + "loss": 10.986, + "step": 21, + "task_loss": 4.757566452026367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.082231521606445, + "epoch": 0.02, + "learning_rate": 9.29839391377853e-07, + "loss": 10.8411, + "step": 22, + "task_loss": 4.698436737060547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.1494140625, + "epoch": 0.02, + "learning_rate": 9.721048182586645e-07, + "loss": 10.9545, + "step": 23, + "task_loss": 4.758025646209717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.298805236816406, + "epoch": 0.02, + "learning_rate": 1.014370245139476e-06, + "loss": 11.4158, + "step": 24, + "task_loss": 4.608049392700195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.7218017578125, + "epoch": 0.02, + "learning_rate": 1.0566356720202875e-06, + "loss": 10.7729, + "step": 25, + "task_loss": 4.495795249938965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.641130447387695, + "epoch": 0.02, + "learning_rate": 1.098901098901099e-06, + "loss": 11.0989, + "step": 26, + "task_loss": 4.5820488929748535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.282276153564453, + "epoch": 0.02, + "learning_rate": 1.1411665257819105e-06, + "loss": 10.9302, + "step": 27, + "task_loss": 4.623043537139893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.058012008666992, + "epoch": 0.02, + "learning_rate": 1.183431952662722e-06, + "loss": 11.944, + "step": 28, + "task_loss": 4.556249618530273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.540451049804688, + "epoch": 0.02, + "learning_rate": 1.2256973795435333e-06, + "loss": 10.6407, + "step": 29, + "task_loss": 4.739696025848389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.025243759155273, + "epoch": 0.03, + "learning_rate": 1.267962806424345e-06, + "loss": 11.1553, + "step": 30, + "task_loss": 4.695968151092529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.77232837677002, + "epoch": 0.03, + "learning_rate": 1.3102282333051563e-06, + "loss": 10.7184, + "step": 31, + "task_loss": 4.6045002937316895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.93844985961914, + "epoch": 0.03, + "learning_rate": 1.3524936601859678e-06, + "loss": 10.9748, + "step": 32, + "task_loss": 4.716665267944336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.5638427734375, + "epoch": 0.03, + "learning_rate": 1.3947590870667795e-06, + "loss": 10.4731, + "step": 33, + "task_loss": 4.525957107543945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.45874309539795, + "epoch": 0.03, + "learning_rate": 1.4370245139475908e-06, + "loss": 10.0613, + "step": 34, + "task_loss": 4.668154716491699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.873498916625977, + "epoch": 0.03, + "learning_rate": 1.4792899408284024e-06, + "loss": 10.7943, + "step": 35, + "task_loss": 4.735232830047607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 14.68942642211914, + "epoch": 0.03, + "learning_rate": 1.521555367709214e-06, + "loss": 11.6994, + "step": 36, + "task_loss": 4.761033058166504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.104684829711914, + "epoch": 0.03, + "learning_rate": 1.5638207945900256e-06, + "loss": 10.2657, + "step": 37, + "task_loss": 4.6487507820129395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.520551681518555, + "epoch": 0.03, + "learning_rate": 1.6060862214708369e-06, + "loss": 11.6035, + "step": 38, + "task_loss": 4.661336898803711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.183745384216309, + "epoch": 0.03, + "learning_rate": 1.6483516483516484e-06, + "loss": 11.5555, + "step": 39, + "task_loss": 4.7185587882995605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.789106369018555, + "epoch": 0.03, + "learning_rate": 1.69061707523246e-06, + "loss": 10.302, + "step": 40, + "task_loss": 4.583945274353027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.97212028503418, + "epoch": 0.03, + "learning_rate": 1.7328825021132714e-06, + "loss": 10.2057, + "step": 41, + "task_loss": 4.640735149383545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.867840766906738, + "epoch": 0.04, + "learning_rate": 1.775147928994083e-06, + "loss": 10.988, + "step": 42, + "task_loss": 4.705913543701172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.908050537109375, + "epoch": 0.04, + "learning_rate": 1.8174133558748946e-06, + "loss": 10.9816, + "step": 43, + "task_loss": 4.523036479949951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.694483757019043, + "epoch": 0.04, + "learning_rate": 1.859678782755706e-06, + "loss": 10.1506, + "step": 44, + "task_loss": 4.536032199859619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.424906730651855, + "epoch": 0.04, + "learning_rate": 1.9019442096365174e-06, + "loss": 11.1176, + "step": 45, + "task_loss": 4.643701553344727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.884790420532227, + "epoch": 0.04, + "learning_rate": 1.944209636517329e-06, + "loss": 11.319, + "step": 46, + "task_loss": 4.609011650085449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.962676048278809, + "epoch": 0.04, + "learning_rate": 1.9864750633981404e-06, + "loss": 10.0115, + "step": 47, + "task_loss": 4.765456199645996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.796348571777344, + "epoch": 0.04, + "learning_rate": 2.028740490278952e-06, + "loss": 11.2703, + "step": 48, + "task_loss": 4.579859733581543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.289663314819336, + "epoch": 0.04, + "learning_rate": 2.0710059171597635e-06, + "loss": 9.8133, + "step": 49, + "task_loss": 4.583054542541504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.093294143676758, + "epoch": 0.04, + "learning_rate": 2.113271344040575e-06, + "loss": 10.9728, + "step": 50, + "task_loss": 4.500999927520752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.974398612976074, + "epoch": 0.04, + "learning_rate": 2.1555367709213865e-06, + "loss": 11.1329, + "step": 51, + "task_loss": 4.763656139373779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.954800605773926, + "epoch": 0.04, + "learning_rate": 2.197802197802198e-06, + "loss": 10.6315, + "step": 52, + "task_loss": 4.7645440101623535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 13.320015907287598, + "epoch": 0.04, + "learning_rate": 2.2400676246830095e-06, + "loss": 10.9405, + "step": 53, + "task_loss": 4.646806240081787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.694287300109863, + "epoch": 0.05, + "learning_rate": 2.282333051563821e-06, + "loss": 10.8696, + "step": 54, + "task_loss": 4.7642822265625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.622802734375, + "epoch": 0.05, + "learning_rate": 2.324598478444632e-06, + "loss": 10.468, + "step": 55, + "task_loss": 4.651461601257324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.191650390625, + "epoch": 0.05, + "learning_rate": 2.366863905325444e-06, + "loss": 11.1634, + "step": 56, + "task_loss": 4.511422634124756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.475743293762207, + "epoch": 0.05, + "learning_rate": 2.4091293322062555e-06, + "loss": 10.7596, + "step": 57, + "task_loss": 4.732883930206299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.170340538024902, + "epoch": 0.05, + "learning_rate": 2.4513947590870666e-06, + "loss": 10.5071, + "step": 58, + "task_loss": 4.723390102386475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.606315612792969, + "epoch": 0.05, + "learning_rate": 2.4936601859678785e-06, + "loss": 10.9258, + "step": 59, + "task_loss": 4.5483927726745605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.338319778442383, + "epoch": 0.05, + "learning_rate": 2.53592561284869e-06, + "loss": 11.211, + "step": 60, + "task_loss": 4.594723224639893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.50085163116455, + "epoch": 0.05, + "learning_rate": 2.578191039729501e-06, + "loss": 10.1077, + "step": 61, + "task_loss": 4.477964878082275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.907859802246094, + "epoch": 0.05, + "learning_rate": 2.6204564666103126e-06, + "loss": 10.416, + "step": 62, + "task_loss": 4.755913257598877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.288369178771973, + "epoch": 0.05, + "learning_rate": 2.6627218934911246e-06, + "loss": 10.6632, + "step": 63, + "task_loss": 4.57666015625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.502639770507812, + "epoch": 0.05, + "learning_rate": 2.7049873203719357e-06, + "loss": 10.5185, + "step": 64, + "task_loss": 4.579930782318115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 13.63343620300293, + "epoch": 0.05, + "learning_rate": 2.747252747252747e-06, + "loss": 11.4454, + "step": 65, + "task_loss": 4.677177906036377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.630847930908203, + "epoch": 0.06, + "learning_rate": 2.789518174133559e-06, + "loss": 11.0318, + "step": 66, + "task_loss": 4.614037036895752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.591766357421875, + "epoch": 0.06, + "learning_rate": 2.83178360101437e-06, + "loss": 10.7626, + "step": 67, + "task_loss": 4.582085609436035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.07567024230957, + "epoch": 0.06, + "learning_rate": 2.8740490278951817e-06, + "loss": 10.6236, + "step": 68, + "task_loss": 4.833033084869385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.121071815490723, + "epoch": 0.06, + "learning_rate": 2.9163144547759936e-06, + "loss": 11.1782, + "step": 69, + "task_loss": 4.737482070922852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.720462799072266, + "epoch": 0.06, + "learning_rate": 2.9585798816568047e-06, + "loss": 10.637, + "step": 70, + "task_loss": 4.638803958892822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.318552017211914, + "epoch": 0.06, + "learning_rate": 3.0008453085376162e-06, + "loss": 10.0053, + "step": 71, + "task_loss": 4.742284774780273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.845678329467773, + "epoch": 0.06, + "learning_rate": 3.043110735418428e-06, + "loss": 10.2372, + "step": 72, + "task_loss": 4.62070369720459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.64316177368164, + "epoch": 0.06, + "learning_rate": 3.0853761622992392e-06, + "loss": 11.5501, + "step": 73, + "task_loss": 4.555132865905762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.010642051696777, + "epoch": 0.06, + "learning_rate": 3.127641589180051e-06, + "loss": 10.2998, + "step": 74, + "task_loss": 4.748885154724121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.951274871826172, + "epoch": 0.06, + "learning_rate": 3.1699070160608622e-06, + "loss": 10.6523, + "step": 75, + "task_loss": 4.4831132888793945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.449923515319824, + "epoch": 0.06, + "learning_rate": 3.2121724429416738e-06, + "loss": 10.7146, + "step": 76, + "task_loss": 4.62700891494751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 13.358880996704102, + "epoch": 0.07, + "learning_rate": 3.2544378698224853e-06, + "loss": 11.1746, + "step": 77, + "task_loss": 4.609684467315674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.9044771194458, + "epoch": 0.07, + "learning_rate": 3.2967032967032968e-06, + "loss": 11.0865, + "step": 78, + "task_loss": 4.658127784729004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.360875129699707, + "epoch": 0.07, + "learning_rate": 3.3389687235841087e-06, + "loss": 10.2149, + "step": 79, + "task_loss": 4.5647125244140625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.531564712524414, + "epoch": 0.07, + "learning_rate": 3.38123415046492e-06, + "loss": 11.6188, + "step": 80, + "task_loss": 4.519094944000244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.807174682617188, + "epoch": 0.07, + "learning_rate": 3.4234995773457313e-06, + "loss": 10.3161, + "step": 81, + "task_loss": 4.749982833862305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.913570404052734, + "epoch": 0.07, + "learning_rate": 3.465765004226543e-06, + "loss": 11.4496, + "step": 82, + "task_loss": 4.597235202789307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.739767074584961, + "epoch": 0.07, + "learning_rate": 3.5080304311073543e-06, + "loss": 12.0094, + "step": 83, + "task_loss": 4.527524948120117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.401664733886719, + "epoch": 0.07, + "learning_rate": 3.550295857988166e-06, + "loss": 11.7643, + "step": 84, + "task_loss": 4.5950927734375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.095552444458008, + "epoch": 0.07, + "learning_rate": 3.5925612848689777e-06, + "loss": 11.1968, + "step": 85, + "task_loss": 4.579239845275879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.871604919433594, + "epoch": 0.07, + "learning_rate": 3.6348267117497893e-06, + "loss": 10.4665, + "step": 86, + "task_loss": 4.7465128898620605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.213700294494629, + "epoch": 0.07, + "learning_rate": 3.6770921386306e-06, + "loss": 10.4609, + "step": 87, + "task_loss": 4.574832916259766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.151535034179688, + "epoch": 0.07, + "learning_rate": 3.719357565511412e-06, + "loss": 10.6299, + "step": 88, + "task_loss": 4.6177802085876465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.927519798278809, + "epoch": 0.08, + "learning_rate": 3.7616229923922234e-06, + "loss": 11.6768, + "step": 89, + "task_loss": 4.618904113769531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.138202667236328, + "epoch": 0.08, + "learning_rate": 3.803888419273035e-06, + "loss": 10.4512, + "step": 90, + "task_loss": 4.729623794555664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.830541610717773, + "epoch": 0.08, + "learning_rate": 3.846153846153847e-06, + "loss": 10.7422, + "step": 91, + "task_loss": 4.632906913757324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.520606994628906, + "epoch": 0.08, + "learning_rate": 3.888419273034658e-06, + "loss": 10.2491, + "step": 92, + "task_loss": 4.691428184509277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.747093200683594, + "epoch": 0.08, + "learning_rate": 3.930684699915469e-06, + "loss": 10.4616, + "step": 93, + "task_loss": 4.500308513641357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 13.003189086914062, + "epoch": 0.08, + "learning_rate": 3.972950126796281e-06, + "loss": 11.4123, + "step": 94, + "task_loss": 4.640787124633789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.187047958374023, + "epoch": 0.08, + "learning_rate": 4.015215553677092e-06, + "loss": 10.5413, + "step": 95, + "task_loss": 4.3826751708984375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.307405471801758, + "epoch": 0.08, + "learning_rate": 4.057480980557904e-06, + "loss": 10.7577, + "step": 96, + "task_loss": 4.4358978271484375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.256270408630371, + "epoch": 0.08, + "learning_rate": 4.099746407438716e-06, + "loss": 10.316, + "step": 97, + "task_loss": 4.61489725112915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.962421417236328, + "epoch": 0.08, + "learning_rate": 4.142011834319527e-06, + "loss": 10.7515, + "step": 98, + "task_loss": 4.597455978393555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.505260467529297, + "epoch": 0.08, + "learning_rate": 4.184277261200338e-06, + "loss": 11.2725, + "step": 99, + "task_loss": 4.555180549621582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.828603744506836, + "epoch": 0.08, + "learning_rate": 4.22654268808115e-06, + "loss": 10.475, + "step": 100, + "task_loss": 4.5248332023620605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.866134643554688, + "epoch": 0.09, + "learning_rate": 4.268808114961961e-06, + "loss": 10.3716, + "step": 101, + "task_loss": 4.572544097900391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.741941452026367, + "epoch": 0.09, + "learning_rate": 4.311073541842773e-06, + "loss": 10.1696, + "step": 102, + "task_loss": 4.495937824249268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.36722183227539, + "epoch": 0.09, + "learning_rate": 4.353338968723585e-06, + "loss": 10.0955, + "step": 103, + "task_loss": 4.5355353355407715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.730938911437988, + "epoch": 0.09, + "learning_rate": 4.395604395604396e-06, + "loss": 10.3114, + "step": 104, + "task_loss": 4.594440937042236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.972132682800293, + "epoch": 0.09, + "learning_rate": 4.437869822485207e-06, + "loss": 10.5973, + "step": 105, + "task_loss": 4.579448699951172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.593635559082031, + "epoch": 0.09, + "learning_rate": 4.480135249366019e-06, + "loss": 11.1273, + "step": 106, + "task_loss": 4.613135814666748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.411273956298828, + "epoch": 0.09, + "learning_rate": 4.52240067624683e-06, + "loss": 10.7971, + "step": 107, + "task_loss": 4.618113994598389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.856842994689941, + "epoch": 0.09, + "learning_rate": 4.564666103127642e-06, + "loss": 10.8939, + "step": 108, + "task_loss": 4.570024490356445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.126867294311523, + "epoch": 0.09, + "learning_rate": 4.606931530008454e-06, + "loss": 10.2366, + "step": 109, + "task_loss": 4.447387218475342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.89476490020752, + "epoch": 0.09, + "learning_rate": 4.649196956889264e-06, + "loss": 11.0047, + "step": 110, + "task_loss": 4.466702938079834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.032048225402832, + "epoch": 0.09, + "learning_rate": 4.691462383770076e-06, + "loss": 9.8645, + "step": 111, + "task_loss": 4.414778709411621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 13.524507522583008, + "epoch": 0.09, + "learning_rate": 4.733727810650888e-06, + "loss": 10.1538, + "step": 112, + "task_loss": 4.66260290145874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.36578369140625, + "epoch": 0.1, + "learning_rate": 4.775993237531699e-06, + "loss": 10.9724, + "step": 113, + "task_loss": 4.530307292938232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.551671981811523, + "epoch": 0.1, + "learning_rate": 4.818258664412511e-06, + "loss": 11.1853, + "step": 114, + "task_loss": 4.478623390197754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.023828506469727, + "epoch": 0.1, + "learning_rate": 4.860524091293322e-06, + "loss": 10.0482, + "step": 115, + "task_loss": 4.4415788650512695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 13.032001495361328, + "epoch": 0.1, + "learning_rate": 4.902789518174133e-06, + "loss": 10.4372, + "step": 116, + "task_loss": 4.519935607910156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.225025177001953, + "epoch": 0.1, + "learning_rate": 4.945054945054945e-06, + "loss": 10.6497, + "step": 117, + "task_loss": 4.48813533782959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.889101028442383, + "epoch": 0.1, + "learning_rate": 4.987320371935757e-06, + "loss": 10.9735, + "step": 118, + "task_loss": 4.515108108520508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.609251976013184, + "epoch": 0.1, + "learning_rate": 5.029585798816568e-06, + "loss": 11.3165, + "step": 119, + "task_loss": 4.3884382247924805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.96807861328125, + "epoch": 0.1, + "learning_rate": 5.07185122569738e-06, + "loss": 10.5082, + "step": 120, + "task_loss": 4.469944000244141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.186902046203613, + "epoch": 0.1, + "learning_rate": 5.114116652578191e-06, + "loss": 10.4668, + "step": 121, + "task_loss": 4.52056360244751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.834338188171387, + "epoch": 0.1, + "learning_rate": 5.156382079459002e-06, + "loss": 10.3641, + "step": 122, + "task_loss": 4.4478607177734375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.181888580322266, + "epoch": 0.1, + "learning_rate": 5.198647506339814e-06, + "loss": 10.5963, + "step": 123, + "task_loss": 4.391740322113037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.219062805175781, + "epoch": 0.1, + "learning_rate": 5.240912933220625e-06, + "loss": 10.2392, + "step": 124, + "task_loss": 4.6055121421813965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.98564338684082, + "epoch": 0.11, + "learning_rate": 5.283178360101437e-06, + "loss": 10.3353, + "step": 125, + "task_loss": 4.401786804199219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.830859184265137, + "epoch": 0.11, + "learning_rate": 5.325443786982249e-06, + "loss": 10.5764, + "step": 126, + "task_loss": 4.343116283416748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.530597686767578, + "epoch": 0.11, + "learning_rate": 5.36770921386306e-06, + "loss": 10.843, + "step": 127, + "task_loss": 4.556358337402344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.160400390625, + "epoch": 0.11, + "learning_rate": 5.409974640743871e-06, + "loss": 9.9327, + "step": 128, + "task_loss": 4.309567928314209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.638243675231934, + "epoch": 0.11, + "learning_rate": 5.452240067624683e-06, + "loss": 9.9881, + "step": 129, + "task_loss": 4.514079570770264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.311914443969727, + "epoch": 0.11, + "learning_rate": 5.494505494505494e-06, + "loss": 10.6223, + "step": 130, + "task_loss": 4.538150310516357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.595855712890625, + "epoch": 0.11, + "learning_rate": 5.536770921386306e-06, + "loss": 10.9531, + "step": 131, + "task_loss": 4.534841060638428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 13.290630340576172, + "epoch": 0.11, + "learning_rate": 5.579036348267118e-06, + "loss": 10.7856, + "step": 132, + "task_loss": 4.390566825866699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.233758926391602, + "epoch": 0.11, + "learning_rate": 5.621301775147929e-06, + "loss": 10.3096, + "step": 133, + "task_loss": 4.5218892097473145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.774062156677246, + "epoch": 0.11, + "learning_rate": 5.66356720202874e-06, + "loss": 9.9051, + "step": 134, + "task_loss": 4.409582138061523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.664326667785645, + "epoch": 0.11, + "learning_rate": 5.705832628909552e-06, + "loss": 11.0271, + "step": 135, + "task_loss": 4.45792293548584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.76945972442627, + "epoch": 0.11, + "learning_rate": 5.748098055790363e-06, + "loss": 10.5237, + "step": 136, + "task_loss": 4.52976655960083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.669245719909668, + "epoch": 0.12, + "learning_rate": 5.790363482671175e-06, + "loss": 10.5259, + "step": 137, + "task_loss": 4.511288642883301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.659748077392578, + "epoch": 0.12, + "learning_rate": 5.832628909551987e-06, + "loss": 10.0496, + "step": 138, + "task_loss": 4.538107395172119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.318220138549805, + "epoch": 0.12, + "learning_rate": 5.874894336432798e-06, + "loss": 11.0436, + "step": 139, + "task_loss": 4.42420768737793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.603837966918945, + "epoch": 0.12, + "learning_rate": 5.917159763313609e-06, + "loss": 11.0026, + "step": 140, + "task_loss": 4.447585582733154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.475722312927246, + "epoch": 0.12, + "learning_rate": 5.959425190194421e-06, + "loss": 9.663, + "step": 141, + "task_loss": 4.526320457458496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.151780128479004, + "epoch": 0.12, + "learning_rate": 6.0016906170752324e-06, + "loss": 10.0851, + "step": 142, + "task_loss": 4.430243492126465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.196475982666016, + "epoch": 0.12, + "learning_rate": 6.043956043956044e-06, + "loss": 10.9937, + "step": 143, + "task_loss": 4.319927215576172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.272387504577637, + "epoch": 0.12, + "learning_rate": 6.086221470836856e-06, + "loss": 10.9631, + "step": 144, + "task_loss": 4.487193584442139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.366275787353516, + "epoch": 0.12, + "learning_rate": 6.128486897717667e-06, + "loss": 10.8045, + "step": 145, + "task_loss": 4.4568257331848145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.08237075805664, + "epoch": 0.12, + "learning_rate": 6.1707523245984785e-06, + "loss": 10.6496, + "step": 146, + "task_loss": 4.381575584411621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.435718536376953, + "epoch": 0.12, + "learning_rate": 6.21301775147929e-06, + "loss": 10.4635, + "step": 147, + "task_loss": 4.340198040008545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.488754272460938, + "epoch": 0.13, + "learning_rate": 6.255283178360102e-06, + "loss": 10.2172, + "step": 148, + "task_loss": 4.461639881134033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.088775634765625, + "epoch": 0.13, + "learning_rate": 6.297548605240913e-06, + "loss": 10.1736, + "step": 149, + "task_loss": 4.251922607421875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.592573165893555, + "epoch": 0.13, + "learning_rate": 6.3398140321217245e-06, + "loss": 10.8346, + "step": 150, + "task_loss": 4.493696689605713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.482400894165039, + "epoch": 0.13, + "learning_rate": 6.382079459002536e-06, + "loss": 9.5301, + "step": 151, + "task_loss": 4.568138599395752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.23709487915039, + "epoch": 0.13, + "learning_rate": 6.4243448858833475e-06, + "loss": 10.5568, + "step": 152, + "task_loss": 4.415182113647461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.710551261901855, + "epoch": 0.13, + "learning_rate": 6.4666103127641594e-06, + "loss": 10.2834, + "step": 153, + "task_loss": 4.313751697540283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.814706802368164, + "epoch": 0.13, + "learning_rate": 6.5088757396449705e-06, + "loss": 9.9942, + "step": 154, + "task_loss": 4.347652912139893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.893689155578613, + "epoch": 0.13, + "learning_rate": 6.551141166525782e-06, + "loss": 10.0578, + "step": 155, + "task_loss": 4.3386077880859375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.638946533203125, + "epoch": 0.13, + "learning_rate": 6.5934065934065935e-06, + "loss": 10.1688, + "step": 156, + "task_loss": 4.252321243286133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.796585083007812, + "epoch": 0.13, + "learning_rate": 6.635672020287405e-06, + "loss": 9.85, + "step": 157, + "task_loss": 4.514593601226807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.203529357910156, + "epoch": 0.13, + "learning_rate": 6.677937447168217e-06, + "loss": 10.4681, + "step": 158, + "task_loss": 4.236311435699463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 13.074034690856934, + "epoch": 0.13, + "learning_rate": 6.720202874049028e-06, + "loss": 10.5984, + "step": 159, + "task_loss": 4.2907304763793945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.61819839477539, + "epoch": 0.14, + "learning_rate": 6.76246830092984e-06, + "loss": 10.3344, + "step": 160, + "task_loss": 4.206151008605957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.346233367919922, + "epoch": 0.14, + "learning_rate": 6.8047337278106515e-06, + "loss": 10.336, + "step": 161, + "task_loss": 4.290561676025391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 13.287874221801758, + "epoch": 0.14, + "learning_rate": 6.846999154691463e-06, + "loss": 10.933, + "step": 162, + "task_loss": 4.171278953552246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.964640617370605, + "epoch": 0.14, + "learning_rate": 6.8892645815722745e-06, + "loss": 10.8671, + "step": 163, + "task_loss": 4.706011772155762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.779141426086426, + "epoch": 0.14, + "learning_rate": 6.931530008453086e-06, + "loss": 9.953, + "step": 164, + "task_loss": 4.320837020874023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.711145401000977, + "epoch": 0.14, + "learning_rate": 6.9737954353338975e-06, + "loss": 10.4274, + "step": 165, + "task_loss": 4.2374982833862305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.074871063232422, + "epoch": 0.14, + "learning_rate": 7.016060862214709e-06, + "loss": 10.7103, + "step": 166, + "task_loss": 4.251299858093262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.305875778198242, + "epoch": 0.14, + "learning_rate": 7.05832628909552e-06, + "loss": 10.3059, + "step": 167, + "task_loss": 4.345815658569336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.87346363067627, + "epoch": 0.14, + "learning_rate": 7.100591715976332e-06, + "loss": 9.5595, + "step": 168, + "task_loss": 4.096438407897949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.741791725158691, + "epoch": 0.14, + "learning_rate": 7.142857142857143e-06, + "loss": 11.1149, + "step": 169, + "task_loss": 4.1396074295043945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.619377136230469, + "epoch": 0.14, + "learning_rate": 7.1851225697379555e-06, + "loss": 11.1664, + "step": 170, + "task_loss": 4.154287815093994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.982399940490723, + "epoch": 0.14, + "learning_rate": 7.227387996618766e-06, + "loss": 10.2731, + "step": 171, + "task_loss": 4.215027332305908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.182270050048828, + "epoch": 0.15, + "learning_rate": 7.2696534234995785e-06, + "loss": 11.0735, + "step": 172, + "task_loss": 4.028750419616699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.805275917053223, + "epoch": 0.15, + "learning_rate": 7.31191885038039e-06, + "loss": 10.3169, + "step": 173, + "task_loss": 4.118965148925781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.965993881225586, + "epoch": 0.15, + "learning_rate": 7.3541842772612e-06, + "loss": 9.3504, + "step": 174, + "task_loss": 4.2412614822387695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.746776580810547, + "epoch": 0.15, + "learning_rate": 7.396449704142013e-06, + "loss": 10.7342, + "step": 175, + "task_loss": 4.341203689575195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.264242172241211, + "epoch": 0.15, + "learning_rate": 7.438715131022824e-06, + "loss": 9.9874, + "step": 176, + "task_loss": 4.498307228088379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.640728950500488, + "epoch": 0.15, + "learning_rate": 7.480980557903636e-06, + "loss": 9.1746, + "step": 177, + "task_loss": 4.313640594482422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.954181671142578, + "epoch": 0.15, + "learning_rate": 7.523245984784447e-06, + "loss": 10.0101, + "step": 178, + "task_loss": 4.221097469329834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.383469581604004, + "epoch": 0.15, + "learning_rate": 7.565511411665258e-06, + "loss": 10.2038, + "step": 179, + "task_loss": 4.190242767333984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.425771713256836, + "epoch": 0.15, + "learning_rate": 7.60777683854607e-06, + "loss": 9.2526, + "step": 180, + "task_loss": 4.167534828186035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.275524139404297, + "epoch": 0.15, + "learning_rate": 7.65004226542688e-06, + "loss": 9.8853, + "step": 181, + "task_loss": 4.186546325683594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.3565673828125, + "epoch": 0.15, + "learning_rate": 7.692307692307694e-06, + "loss": 10.179, + "step": 182, + "task_loss": 4.147878170013428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.274319648742676, + "epoch": 0.15, + "learning_rate": 7.734573119188505e-06, + "loss": 9.8514, + "step": 183, + "task_loss": 4.218626022338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.793763160705566, + "epoch": 0.16, + "learning_rate": 7.776838546069316e-06, + "loss": 9.6777, + "step": 184, + "task_loss": 4.267410755157471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.20872688293457, + "epoch": 0.16, + "learning_rate": 7.819103972950127e-06, + "loss": 9.9578, + "step": 185, + "task_loss": 4.374319076538086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.686092376708984, + "epoch": 0.16, + "learning_rate": 7.861369399830938e-06, + "loss": 9.45, + "step": 186, + "task_loss": 3.8152966499328613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.13048267364502, + "epoch": 0.16, + "learning_rate": 7.90363482671175e-06, + "loss": 10.4769, + "step": 187, + "task_loss": 4.035782337188721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.271520614624023, + "epoch": 0.16, + "learning_rate": 7.945900253592562e-06, + "loss": 10.5395, + "step": 188, + "task_loss": 4.126121997833252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.013580322265625, + "epoch": 0.16, + "learning_rate": 7.988165680473373e-06, + "loss": 9.9373, + "step": 189, + "task_loss": 4.293776035308838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.108755111694336, + "epoch": 0.16, + "learning_rate": 8.030431107354184e-06, + "loss": 10.2416, + "step": 190, + "task_loss": 3.8266375064849854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 12.703033447265625, + "epoch": 0.16, + "learning_rate": 8.072696534234995e-06, + "loss": 11.1518, + "step": 191, + "task_loss": 4.055019378662109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.374983787536621, + "epoch": 0.16, + "learning_rate": 8.114961961115808e-06, + "loss": 10.4757, + "step": 192, + "task_loss": 4.20020055770874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.990991592407227, + "epoch": 0.16, + "learning_rate": 8.157227387996619e-06, + "loss": 9.7356, + "step": 193, + "task_loss": 4.024471759796143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.545437812805176, + "epoch": 0.16, + "learning_rate": 8.199492814877432e-06, + "loss": 9.5969, + "step": 194, + "task_loss": 4.375448226928711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.055362701416016, + "epoch": 0.16, + "learning_rate": 8.241758241758243e-06, + "loss": 10.1901, + "step": 195, + "task_loss": 4.030673027038574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.091379165649414, + "epoch": 0.17, + "learning_rate": 8.284023668639054e-06, + "loss": 9.8951, + "step": 196, + "task_loss": 4.250060558319092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.485389709472656, + "epoch": 0.17, + "learning_rate": 8.326289095519865e-06, + "loss": 9.8495, + "step": 197, + "task_loss": 4.061662673950195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.825084686279297, + "epoch": 0.17, + "learning_rate": 8.368554522400676e-06, + "loss": 9.6397, + "step": 198, + "task_loss": 4.07577657699585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.405561447143555, + "epoch": 0.17, + "learning_rate": 8.410819949281489e-06, + "loss": 9.7127, + "step": 199, + "task_loss": 4.1694231033325195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.09040355682373, + "epoch": 0.17, + "learning_rate": 8.4530853761623e-06, + "loss": 9.645, + "step": 200, + "task_loss": 4.187140464782715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.289454460144043, + "epoch": 0.17, + "learning_rate": 8.495350803043111e-06, + "loss": 10.6047, + "step": 201, + "task_loss": 4.169828414916992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.432872772216797, + "epoch": 0.17, + "learning_rate": 8.537616229923922e-06, + "loss": 9.7342, + "step": 202, + "task_loss": 3.9641635417938232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.453047752380371, + "epoch": 0.17, + "learning_rate": 8.579881656804733e-06, + "loss": 9.718, + "step": 203, + "task_loss": 3.986605405807495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 11.925058364868164, + "epoch": 0.17, + "learning_rate": 8.622147083685546e-06, + "loss": 10.1314, + "step": 204, + "task_loss": 3.8888165950775146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.727497100830078, + "epoch": 0.17, + "learning_rate": 8.664412510566357e-06, + "loss": 9.6088, + "step": 205, + "task_loss": 3.9736809730529785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.890064239501953, + "epoch": 0.17, + "learning_rate": 8.70667793744717e-06, + "loss": 9.1494, + "step": 206, + "task_loss": 3.6250922679901123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.582391738891602, + "epoch": 0.17, + "learning_rate": 8.74894336432798e-06, + "loss": 9.3244, + "step": 207, + "task_loss": 4.294853210449219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.410274505615234, + "epoch": 0.18, + "learning_rate": 8.791208791208792e-06, + "loss": 8.9835, + "step": 208, + "task_loss": 4.244947910308838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.908506393432617, + "epoch": 0.18, + "learning_rate": 8.833474218089603e-06, + "loss": 9.7579, + "step": 209, + "task_loss": 4.046106815338135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.048578262329102, + "epoch": 0.18, + "learning_rate": 8.875739644970414e-06, + "loss": 9.7513, + "step": 210, + "task_loss": 3.86310076713562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.475410461425781, + "epoch": 0.18, + "learning_rate": 8.918005071851227e-06, + "loss": 9.129, + "step": 211, + "task_loss": 3.783229351043701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.169841766357422, + "epoch": 0.18, + "learning_rate": 8.960270498732038e-06, + "loss": 9.9547, + "step": 212, + "task_loss": 3.966848611831665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.033048629760742, + "epoch": 0.18, + "learning_rate": 9.002535925612849e-06, + "loss": 9.4696, + "step": 213, + "task_loss": 3.591090202331543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.172948837280273, + "epoch": 0.18, + "learning_rate": 9.04480135249366e-06, + "loss": 8.8152, + "step": 214, + "task_loss": 3.9587864875793457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.5323486328125, + "epoch": 0.18, + "learning_rate": 9.087066779374471e-06, + "loss": 9.4885, + "step": 215, + "task_loss": 3.749351978302002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.818955421447754, + "epoch": 0.18, + "learning_rate": 9.129332206255284e-06, + "loss": 9.8823, + "step": 216, + "task_loss": 3.7121710777282715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.067737579345703, + "epoch": 0.18, + "learning_rate": 9.171597633136095e-06, + "loss": 8.6905, + "step": 217, + "task_loss": 4.03646183013916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.002960205078125, + "epoch": 0.18, + "learning_rate": 9.213863060016908e-06, + "loss": 9.009, + "step": 218, + "task_loss": 3.9315176010131836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.221734046936035, + "epoch": 0.19, + "learning_rate": 9.256128486897717e-06, + "loss": 8.491, + "step": 219, + "task_loss": 3.6113595962524414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.229199409484863, + "epoch": 0.19, + "learning_rate": 9.298393913778528e-06, + "loss": 8.931, + "step": 220, + "task_loss": 3.6735680103302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.432989120483398, + "epoch": 0.19, + "learning_rate": 9.340659340659341e-06, + "loss": 9.4439, + "step": 221, + "task_loss": 3.8682448863983154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.591475486755371, + "epoch": 0.19, + "learning_rate": 9.382924767540152e-06, + "loss": 8.6367, + "step": 222, + "task_loss": 3.29972505569458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.7122802734375, + "epoch": 0.19, + "learning_rate": 9.425190194420965e-06, + "loss": 8.6281, + "step": 223, + "task_loss": 3.6467082500457764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.89112663269043, + "epoch": 0.19, + "learning_rate": 9.467455621301776e-06, + "loss": 9.0989, + "step": 224, + "task_loss": 3.9356582164764404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.698701858520508, + "epoch": 0.19, + "learning_rate": 9.509721048182587e-06, + "loss": 8.742, + "step": 225, + "task_loss": 3.239922046661377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.923625946044922, + "epoch": 0.19, + "learning_rate": 9.551986475063398e-06, + "loss": 9.9672, + "step": 226, + "task_loss": 3.6118457317352295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.537036895751953, + "epoch": 0.19, + "learning_rate": 9.59425190194421e-06, + "loss": 9.6852, + "step": 227, + "task_loss": 3.727559804916382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.187715530395508, + "epoch": 0.19, + "learning_rate": 9.636517328825022e-06, + "loss": 9.2675, + "step": 228, + "task_loss": 3.956024408340454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.77787971496582, + "epoch": 0.19, + "learning_rate": 9.678782755705833e-06, + "loss": 8.7379, + "step": 229, + "task_loss": 3.750727891921997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.334738731384277, + "epoch": 0.19, + "learning_rate": 9.721048182586644e-06, + "loss": 9.2216, + "step": 230, + "task_loss": 4.032806396484375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.017505645751953, + "epoch": 0.2, + "learning_rate": 9.763313609467455e-06, + "loss": 9.6687, + "step": 231, + "task_loss": 3.8098485469818115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.59679889678955, + "epoch": 0.2, + "learning_rate": 9.805579036348266e-06, + "loss": 9.1146, + "step": 232, + "task_loss": 3.5098395347595215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.5515718460083, + "epoch": 0.2, + "learning_rate": 9.84784446322908e-06, + "loss": 9.2432, + "step": 233, + "task_loss": 3.9476349353790283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.95006275177002, + "epoch": 0.2, + "learning_rate": 9.89010989010989e-06, + "loss": 9.017, + "step": 234, + "task_loss": 3.2772388458251953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.057106018066406, + "epoch": 0.2, + "learning_rate": 9.932375316990703e-06, + "loss": 9.0479, + "step": 235, + "task_loss": 3.8597865104675293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.000930786132812, + "epoch": 0.2, + "learning_rate": 9.974640743871514e-06, + "loss": 8.7053, + "step": 236, + "task_loss": 3.56473445892334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.533687591552734, + "epoch": 0.2, + "learning_rate": 1.0016906170752325e-05, + "loss": 9.0735, + "step": 237, + "task_loss": 3.6946725845336914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.793607711791992, + "epoch": 0.2, + "learning_rate": 1.0059171597633136e-05, + "loss": 8.6857, + "step": 238, + "task_loss": 3.6462836265563965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.320764541625977, + "epoch": 0.2, + "learning_rate": 1.0101437024513947e-05, + "loss": 9.093, + "step": 239, + "task_loss": 3.744810104370117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.179412841796875, + "epoch": 0.2, + "learning_rate": 1.014370245139476e-05, + "loss": 8.8603, + "step": 240, + "task_loss": 3.747973918914795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.077903747558594, + "epoch": 0.2, + "learning_rate": 1.0185967878275571e-05, + "loss": 9.1231, + "step": 241, + "task_loss": 3.7238407135009766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.257226943969727, + "epoch": 0.2, + "learning_rate": 1.0228233305156382e-05, + "loss": 8.9915, + "step": 242, + "task_loss": 3.587797164916992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.561230659484863, + "epoch": 0.21, + "learning_rate": 1.0270498732037193e-05, + "loss": 8.7602, + "step": 243, + "task_loss": 3.36590576171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.141240119934082, + "epoch": 0.21, + "learning_rate": 1.0312764158918005e-05, + "loss": 8.9407, + "step": 244, + "task_loss": 3.527395248413086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.287521362304688, + "epoch": 0.21, + "learning_rate": 1.0355029585798817e-05, + "loss": 8.4062, + "step": 245, + "task_loss": 3.1535964012145996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.89594841003418, + "epoch": 0.21, + "learning_rate": 1.0397295012679628e-05, + "loss": 8.7649, + "step": 246, + "task_loss": 3.2914223670959473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.468308448791504, + "epoch": 0.21, + "learning_rate": 1.0439560439560441e-05, + "loss": 9.0471, + "step": 247, + "task_loss": 3.190307140350342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.224224090576172, + "epoch": 0.21, + "learning_rate": 1.048182586644125e-05, + "loss": 8.7487, + "step": 248, + "task_loss": 3.277031660079956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.15646743774414, + "epoch": 0.21, + "learning_rate": 1.0524091293322063e-05, + "loss": 9.5886, + "step": 249, + "task_loss": 3.405691385269165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.508550643920898, + "epoch": 0.21, + "learning_rate": 1.0566356720202874e-05, + "loss": 8.9093, + "step": 250, + "task_loss": 3.5275111198425293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.746103286743164, + "epoch": 0.21, + "learning_rate": 1.0608622147083686e-05, + "loss": 8.2074, + "step": 251, + "task_loss": 3.4609551429748535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.537322998046875, + "epoch": 0.21, + "learning_rate": 1.0650887573964498e-05, + "loss": 9.0778, + "step": 252, + "task_loss": 3.7272391319274902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.144298553466797, + "epoch": 0.21, + "learning_rate": 1.069315300084531e-05, + "loss": 7.3586, + "step": 253, + "task_loss": 3.3878133296966553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.603456497192383, + "epoch": 0.21, + "learning_rate": 1.073541842772612e-05, + "loss": 8.6143, + "step": 254, + "task_loss": 3.531137704849243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.22976303100586, + "epoch": 0.22, + "learning_rate": 1.0777683854606932e-05, + "loss": 8.3453, + "step": 255, + "task_loss": 2.8870022296905518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.45205545425415, + "epoch": 0.22, + "learning_rate": 1.0819949281487743e-05, + "loss": 8.4611, + "step": 256, + "task_loss": 3.4483728408813477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.165212631225586, + "epoch": 0.22, + "learning_rate": 1.0862214708368555e-05, + "loss": 8.0376, + "step": 257, + "task_loss": 3.178316593170166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.161018371582031, + "epoch": 0.22, + "learning_rate": 1.0904480135249366e-05, + "loss": 8.0121, + "step": 258, + "task_loss": 3.801513433456421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.667741775512695, + "epoch": 0.22, + "learning_rate": 1.094674556213018e-05, + "loss": 7.6147, + "step": 259, + "task_loss": 3.5329809188842773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.509736061096191, + "epoch": 0.22, + "learning_rate": 1.0989010989010989e-05, + "loss": 8.6583, + "step": 260, + "task_loss": 3.0156421661376953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.281269073486328, + "epoch": 0.22, + "learning_rate": 1.1031276415891801e-05, + "loss": 8.7036, + "step": 261, + "task_loss": 3.0013539791107178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.716171264648438, + "epoch": 0.22, + "learning_rate": 1.1073541842772613e-05, + "loss": 8.8486, + "step": 262, + "task_loss": 3.244210720062256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.68092155456543, + "epoch": 0.22, + "learning_rate": 1.1115807269653424e-05, + "loss": 8.0044, + "step": 263, + "task_loss": 3.2940123081207275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.401957511901855, + "epoch": 0.22, + "learning_rate": 1.1158072696534236e-05, + "loss": 8.7166, + "step": 264, + "task_loss": 3.208735704421997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.635251998901367, + "epoch": 0.22, + "learning_rate": 1.1200338123415047e-05, + "loss": 8.0107, + "step": 265, + "task_loss": 3.4525182247161865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.818111419677734, + "epoch": 0.22, + "learning_rate": 1.1242603550295859e-05, + "loss": 7.7175, + "step": 266, + "task_loss": 2.727339744567871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.090768814086914, + "epoch": 0.23, + "learning_rate": 1.128486897717667e-05, + "loss": 7.9907, + "step": 267, + "task_loss": 3.7688605785369873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.088828086853027, + "epoch": 0.23, + "learning_rate": 1.132713440405748e-05, + "loss": 8.2883, + "step": 268, + "task_loss": 3.1267666816711426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.61367130279541, + "epoch": 0.23, + "learning_rate": 1.1369399830938294e-05, + "loss": 7.5504, + "step": 269, + "task_loss": 3.1898815631866455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.58709716796875, + "epoch": 0.23, + "learning_rate": 1.1411665257819105e-05, + "loss": 8.4225, + "step": 270, + "task_loss": 2.7470362186431885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.159021377563477, + "epoch": 0.23, + "learning_rate": 1.1453930684699916e-05, + "loss": 7.9497, + "step": 271, + "task_loss": 3.456969976425171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.73142147064209, + "epoch": 0.23, + "learning_rate": 1.1496196111580727e-05, + "loss": 7.9502, + "step": 272, + "task_loss": 3.209144353866577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.608966827392578, + "epoch": 0.23, + "learning_rate": 1.153846153846154e-05, + "loss": 8.5023, + "step": 273, + "task_loss": 3.067208766937256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.067193984985352, + "epoch": 0.23, + "learning_rate": 1.158072696534235e-05, + "loss": 8.4317, + "step": 274, + "task_loss": 2.8097782135009766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.70694637298584, + "epoch": 0.23, + "learning_rate": 1.1622992392223162e-05, + "loss": 8.8497, + "step": 275, + "task_loss": 3.1005516052246094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.506094932556152, + "epoch": 0.23, + "learning_rate": 1.1665257819103974e-05, + "loss": 7.9693, + "step": 276, + "task_loss": 3.0219662189483643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 10.111152648925781, + "epoch": 0.23, + "learning_rate": 1.1707523245984786e-05, + "loss": 8.1954, + "step": 277, + "task_loss": 3.1629347801208496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.7543840408325195, + "epoch": 0.23, + "learning_rate": 1.1749788672865597e-05, + "loss": 7.7928, + "step": 278, + "task_loss": 3.210057020187378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.188498497009277, + "epoch": 0.24, + "learning_rate": 1.1792054099746408e-05, + "loss": 8.309, + "step": 279, + "task_loss": 3.192016839981079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.26401138305664, + "epoch": 0.24, + "learning_rate": 1.1834319526627219e-05, + "loss": 8.2512, + "step": 280, + "task_loss": 3.2035396099090576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.50117301940918, + "epoch": 0.24, + "learning_rate": 1.1876584953508032e-05, + "loss": 7.7436, + "step": 281, + "task_loss": 3.1840407848358154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.73055362701416, + "epoch": 0.24, + "learning_rate": 1.1918850380388843e-05, + "loss": 8.3993, + "step": 282, + "task_loss": 3.3466413021087646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.744690895080566, + "epoch": 0.24, + "learning_rate": 1.1961115807269654e-05, + "loss": 8.0616, + "step": 283, + "task_loss": 2.9450626373291016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.223235130310059, + "epoch": 0.24, + "learning_rate": 1.2003381234150465e-05, + "loss": 7.4737, + "step": 284, + "task_loss": 2.517256736755371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.263904571533203, + "epoch": 0.24, + "learning_rate": 1.2045646661031278e-05, + "loss": 7.2453, + "step": 285, + "task_loss": 3.256157875061035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.5834479331970215, + "epoch": 0.24, + "learning_rate": 1.2087912087912089e-05, + "loss": 7.5838, + "step": 286, + "task_loss": 2.9883928298950195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.039785385131836, + "epoch": 0.24, + "learning_rate": 1.21301775147929e-05, + "loss": 7.7269, + "step": 287, + "task_loss": 2.9204177856445312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.620002746582031, + "epoch": 0.24, + "learning_rate": 1.2172442941673713e-05, + "loss": 7.6982, + "step": 288, + "task_loss": 3.3289504051208496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.330410480499268, + "epoch": 0.24, + "learning_rate": 1.2214708368554522e-05, + "loss": 6.6127, + "step": 289, + "task_loss": 3.1552486419677734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.199276924133301, + "epoch": 0.24, + "learning_rate": 1.2256973795435335e-05, + "loss": 6.9329, + "step": 290, + "task_loss": 2.586980104446411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.269084453582764, + "epoch": 0.25, + "learning_rate": 1.2299239222316146e-05, + "loss": 7.3932, + "step": 291, + "task_loss": 3.4052515029907227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.650925636291504, + "epoch": 0.25, + "learning_rate": 1.2341504649196957e-05, + "loss": 7.279, + "step": 292, + "task_loss": 2.772911310195923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.6374993324279785, + "epoch": 0.25, + "learning_rate": 1.238377007607777e-05, + "loss": 6.8702, + "step": 293, + "task_loss": 2.59550142288208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.041665077209473, + "epoch": 0.25, + "learning_rate": 1.242603550295858e-05, + "loss": 7.3327, + "step": 294, + "task_loss": 2.7869927883148193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 9.058813095092773, + "epoch": 0.25, + "learning_rate": 1.2468300929839392e-05, + "loss": 7.486, + "step": 295, + "task_loss": 2.856990337371826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.69514274597168, + "epoch": 0.25, + "learning_rate": 1.2510566356720205e-05, + "loss": 7.4231, + "step": 296, + "task_loss": 3.403477907180786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.02663516998291, + "epoch": 0.25, + "learning_rate": 1.2552831783601016e-05, + "loss": 7.0087, + "step": 297, + "task_loss": 2.9371819496154785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.265678882598877, + "epoch": 0.25, + "learning_rate": 1.2595097210481827e-05, + "loss": 7.0669, + "step": 298, + "task_loss": 2.465744733810425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.6635541915893555, + "epoch": 0.25, + "learning_rate": 1.2637362637362638e-05, + "loss": 7.1107, + "step": 299, + "task_loss": 2.632458209991455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.309359073638916, + "epoch": 0.25, + "learning_rate": 1.2679628064243449e-05, + "loss": 7.4685, + "step": 300, + "task_loss": 3.1758174896240234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.924592971801758, + "epoch": 0.25, + "learning_rate": 1.2721893491124262e-05, + "loss": 7.228, + "step": 301, + "task_loss": 2.345716714859009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.444634437561035, + "epoch": 0.26, + "learning_rate": 1.2764158918005073e-05, + "loss": 7.2159, + "step": 302, + "task_loss": 2.6961519718170166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.346635818481445, + "epoch": 0.26, + "learning_rate": 1.2806424344885884e-05, + "loss": 6.77, + "step": 303, + "task_loss": 2.6522417068481445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.010703086853027, + "epoch": 0.26, + "learning_rate": 1.2848689771766695e-05, + "loss": 7.0704, + "step": 304, + "task_loss": 3.018387794494629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.830002784729004, + "epoch": 0.26, + "learning_rate": 1.2890955198647506e-05, + "loss": 7.4245, + "step": 305, + "task_loss": 3.183865547180176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.979141712188721, + "epoch": 0.26, + "learning_rate": 1.2933220625528319e-05, + "loss": 7.3239, + "step": 306, + "task_loss": 2.4571142196655273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.972380638122559, + "epoch": 0.26, + "learning_rate": 1.297548605240913e-05, + "loss": 7.2231, + "step": 307, + "task_loss": 2.3960916996002197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.064640522003174, + "epoch": 0.26, + "learning_rate": 1.3017751479289941e-05, + "loss": 6.6009, + "step": 308, + "task_loss": 2.6419718265533447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.610091686248779, + "epoch": 0.26, + "learning_rate": 1.3060016906170752e-05, + "loss": 7.2452, + "step": 309, + "task_loss": 2.8806211948394775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.081999778747559, + "epoch": 0.26, + "learning_rate": 1.3102282333051563e-05, + "loss": 6.1722, + "step": 310, + "task_loss": 2.75994873046875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.020060539245605, + "epoch": 0.26, + "learning_rate": 1.3144547759932378e-05, + "loss": 6.9228, + "step": 311, + "task_loss": 2.7260892391204834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.168285369873047, + "epoch": 0.26, + "learning_rate": 1.3186813186813187e-05, + "loss": 7.0175, + "step": 312, + "task_loss": 2.3912341594696045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.537705421447754, + "epoch": 0.26, + "learning_rate": 1.3229078613693998e-05, + "loss": 7.0273, + "step": 313, + "task_loss": 2.5945322513580322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.620354652404785, + "epoch": 0.27, + "learning_rate": 1.327134404057481e-05, + "loss": 7.0173, + "step": 314, + "task_loss": 3.1213581562042236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.438002109527588, + "epoch": 0.27, + "learning_rate": 1.3313609467455624e-05, + "loss": 6.332, + "step": 315, + "task_loss": 2.336137533187866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.507883071899414, + "epoch": 0.27, + "learning_rate": 1.3355874894336435e-05, + "loss": 6.9275, + "step": 316, + "task_loss": 2.363334894180298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.7529296875, + "epoch": 0.27, + "learning_rate": 1.3398140321217246e-05, + "loss": 6.9245, + "step": 317, + "task_loss": 2.8360331058502197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.88886833190918, + "epoch": 0.27, + "learning_rate": 1.3440405748098055e-05, + "loss": 6.1753, + "step": 318, + "task_loss": 2.483402729034424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.032519340515137, + "epoch": 0.27, + "learning_rate": 1.3482671174978866e-05, + "loss": 7.0717, + "step": 319, + "task_loss": 3.0559518337249756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.813842296600342, + "epoch": 0.27, + "learning_rate": 1.352493660185968e-05, + "loss": 7.3942, + "step": 320, + "task_loss": 2.770368814468384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.590816497802734, + "epoch": 0.27, + "learning_rate": 1.3567202028740492e-05, + "loss": 6.1288, + "step": 321, + "task_loss": 2.619555711746216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.410475730895996, + "epoch": 0.27, + "learning_rate": 1.3609467455621303e-05, + "loss": 6.5503, + "step": 322, + "task_loss": 2.099933385848999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.537872314453125, + "epoch": 0.27, + "learning_rate": 1.3651732882502114e-05, + "loss": 7.053, + "step": 323, + "task_loss": 3.1748578548431396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.0810041427612305, + "epoch": 0.27, + "learning_rate": 1.3693998309382925e-05, + "loss": 6.0314, + "step": 324, + "task_loss": 2.4908924102783203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.902927875518799, + "epoch": 0.27, + "learning_rate": 1.3736263736263738e-05, + "loss": 6.7185, + "step": 325, + "task_loss": 1.9620531797409058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.460475444793701, + "epoch": 0.28, + "learning_rate": 1.3778529163144549e-05, + "loss": 5.9992, + "step": 326, + "task_loss": 2.396761894226074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.233303070068359, + "epoch": 0.28, + "learning_rate": 1.382079459002536e-05, + "loss": 6.755, + "step": 327, + "task_loss": 2.5932676792144775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.193846702575684, + "epoch": 0.28, + "learning_rate": 1.3863060016906171e-05, + "loss": 6.4138, + "step": 328, + "task_loss": 2.349155902862549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.386889457702637, + "epoch": 0.28, + "learning_rate": 1.3905325443786982e-05, + "loss": 6.4544, + "step": 329, + "task_loss": 2.641763687133789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.899024963378906, + "epoch": 0.28, + "learning_rate": 1.3947590870667795e-05, + "loss": 6.3243, + "step": 330, + "task_loss": 2.0411782264709473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.233712673187256, + "epoch": 0.28, + "learning_rate": 1.3989856297548606e-05, + "loss": 6.0765, + "step": 331, + "task_loss": 2.0285537242889404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.895257949829102, + "epoch": 0.28, + "learning_rate": 1.4032121724429417e-05, + "loss": 6.289, + "step": 332, + "task_loss": 2.523454189300537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.672489166259766, + "epoch": 0.28, + "learning_rate": 1.4074387151310228e-05, + "loss": 6.4813, + "step": 333, + "task_loss": 1.8559277057647705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.948751449584961, + "epoch": 0.28, + "learning_rate": 1.411665257819104e-05, + "loss": 6.1338, + "step": 334, + "task_loss": 2.6344242095947266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.425365447998047, + "epoch": 0.28, + "learning_rate": 1.4158918005071852e-05, + "loss": 7.0047, + "step": 335, + "task_loss": 2.5383880138397217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.567205429077148, + "epoch": 0.28, + "learning_rate": 1.4201183431952663e-05, + "loss": 6.0035, + "step": 336, + "task_loss": 2.2867391109466553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.570411205291748, + "epoch": 0.28, + "learning_rate": 1.4243448858833474e-05, + "loss": 5.5269, + "step": 337, + "task_loss": 2.302530527114868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.863937854766846, + "epoch": 0.29, + "learning_rate": 1.4285714285714285e-05, + "loss": 6.2082, + "step": 338, + "task_loss": 2.3930721282958984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.419766426086426, + "epoch": 0.29, + "learning_rate": 1.4327979712595097e-05, + "loss": 5.9672, + "step": 339, + "task_loss": 2.5264058113098145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.894221782684326, + "epoch": 0.29, + "learning_rate": 1.4370245139475911e-05, + "loss": 5.7745, + "step": 340, + "task_loss": 2.48919939994812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.262081623077393, + "epoch": 0.29, + "learning_rate": 1.441251056635672e-05, + "loss": 5.8928, + "step": 341, + "task_loss": 2.296867847442627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 8.025496482849121, + "epoch": 0.29, + "learning_rate": 1.4454775993237531e-05, + "loss": 6.122, + "step": 342, + "task_loss": 2.5754010677337646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.355745315551758, + "epoch": 0.29, + "learning_rate": 1.4497041420118343e-05, + "loss": 6.1599, + "step": 343, + "task_loss": 2.6352131366729736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.904348373413086, + "epoch": 0.29, + "learning_rate": 1.4539306846999157e-05, + "loss": 6.0372, + "step": 344, + "task_loss": 2.200890302658081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.383485317230225, + "epoch": 0.29, + "learning_rate": 1.4581572273879968e-05, + "loss": 5.7256, + "step": 345, + "task_loss": 2.2610771656036377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.447573661804199, + "epoch": 0.29, + "learning_rate": 1.462383770076078e-05, + "loss": 5.5437, + "step": 346, + "task_loss": 2.330890655517578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.429883003234863, + "epoch": 0.29, + "learning_rate": 1.466610312764159e-05, + "loss": 5.7613, + "step": 347, + "task_loss": 2.0712246894836426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.443944931030273, + "epoch": 0.29, + "learning_rate": 1.47083685545224e-05, + "loss": 5.7443, + "step": 348, + "task_loss": 2.4881842136383057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.506386756896973, + "epoch": 0.29, + "learning_rate": 1.4750633981403214e-05, + "loss": 5.4517, + "step": 349, + "task_loss": 1.6730608940124512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.357576847076416, + "epoch": 0.3, + "learning_rate": 1.4792899408284025e-05, + "loss": 4.9021, + "step": 350, + "task_loss": 2.2803804874420166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.611043453216553, + "epoch": 0.3, + "learning_rate": 1.4835164835164836e-05, + "loss": 5.723, + "step": 351, + "task_loss": 1.7297115325927734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.749140739440918, + "epoch": 0.3, + "learning_rate": 1.4877430262045647e-05, + "loss": 6.0554, + "step": 352, + "task_loss": 2.546757936477661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.688474655151367, + "epoch": 0.3, + "learning_rate": 1.4919695688926458e-05, + "loss": 5.3282, + "step": 353, + "task_loss": 1.8218973875045776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.267772674560547, + "epoch": 0.3, + "learning_rate": 1.4961961115807271e-05, + "loss": 6.354, + "step": 354, + "task_loss": 1.9233524799346924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.722354412078857, + "epoch": 0.3, + "learning_rate": 1.5004226542688082e-05, + "loss": 5.5794, + "step": 355, + "task_loss": 2.558985710144043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.218945503234863, + "epoch": 0.3, + "learning_rate": 1.5046491969568893e-05, + "loss": 4.9183, + "step": 356, + "task_loss": 2.263394832611084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.910632133483887, + "epoch": 0.3, + "learning_rate": 1.5088757396449705e-05, + "loss": 5.2828, + "step": 357, + "task_loss": 2.0332255363464355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.502803325653076, + "epoch": 0.3, + "learning_rate": 1.5131022823330516e-05, + "loss": 5.3063, + "step": 358, + "task_loss": 2.33392071723938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.088931083679199, + "epoch": 0.3, + "learning_rate": 1.5173288250211328e-05, + "loss": 6.3923, + "step": 359, + "task_loss": 1.8620336055755615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.987544059753418, + "epoch": 0.3, + "learning_rate": 1.521555367709214e-05, + "loss": 5.3951, + "step": 360, + "task_loss": 2.5419938564300537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.578038215637207, + "epoch": 0.3, + "learning_rate": 1.525781910397295e-05, + "loss": 4.9977, + "step": 361, + "task_loss": 1.972806692123413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.173656463623047, + "epoch": 0.31, + "learning_rate": 1.530008453085376e-05, + "loss": 5.0946, + "step": 362, + "task_loss": 1.5557239055633545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.562951564788818, + "epoch": 0.31, + "learning_rate": 1.534234995773457e-05, + "loss": 5.8932, + "step": 363, + "task_loss": 2.416274309158325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.875923156738281, + "epoch": 0.31, + "learning_rate": 1.5384615384615387e-05, + "loss": 4.9647, + "step": 364, + "task_loss": 2.394611358642578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.773307800292969, + "epoch": 0.31, + "learning_rate": 1.5426880811496197e-05, + "loss": 5.5396, + "step": 365, + "task_loss": 1.7742772102355957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.911276340484619, + "epoch": 0.31, + "learning_rate": 1.546914623837701e-05, + "loss": 5.5433, + "step": 366, + "task_loss": 1.9132928848266602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.448709487915039, + "epoch": 0.31, + "learning_rate": 1.551141166525782e-05, + "loss": 5.0675, + "step": 367, + "task_loss": 2.2840864658355713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.060549736022949, + "epoch": 0.31, + "learning_rate": 1.555367709213863e-05, + "loss": 4.6987, + "step": 368, + "task_loss": 2.443235397338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.242588043212891, + "epoch": 0.31, + "learning_rate": 1.5595942519019444e-05, + "loss": 4.6777, + "step": 369, + "task_loss": 1.955786108970642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.5537238121032715, + "epoch": 0.31, + "learning_rate": 1.5638207945900254e-05, + "loss": 5.8653, + "step": 370, + "task_loss": 2.4549827575683594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.0424604415893555, + "epoch": 0.31, + "learning_rate": 1.5680473372781066e-05, + "loss": 5.1823, + "step": 371, + "task_loss": 1.7950185537338257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.466312885284424, + "epoch": 0.31, + "learning_rate": 1.5722738799661876e-05, + "loss": 5.4229, + "step": 372, + "task_loss": 1.4839543104171753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.001593112945557, + "epoch": 0.32, + "learning_rate": 1.576500422654269e-05, + "loss": 5.1429, + "step": 373, + "task_loss": 1.7042378187179565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 7.290884017944336, + "epoch": 0.32, + "learning_rate": 1.58072696534235e-05, + "loss": 5.336, + "step": 374, + "task_loss": 1.8394978046417236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.908597946166992, + "epoch": 0.32, + "learning_rate": 1.584953508030431e-05, + "loss": 5.2446, + "step": 375, + "task_loss": 1.6273515224456787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.360844135284424, + "epoch": 0.32, + "learning_rate": 1.5891800507185124e-05, + "loss": 4.5445, + "step": 376, + "task_loss": 1.5615137815475464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.3032426834106445, + "epoch": 0.32, + "learning_rate": 1.5934065934065933e-05, + "loss": 5.3883, + "step": 377, + "task_loss": 1.7372429370880127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.513960838317871, + "epoch": 0.32, + "learning_rate": 1.5976331360946746e-05, + "loss": 4.8746, + "step": 378, + "task_loss": 2.129615306854248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.728913307189941, + "epoch": 0.32, + "learning_rate": 1.601859678782756e-05, + "loss": 5.2556, + "step": 379, + "task_loss": 1.3387094736099243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.840574741363525, + "epoch": 0.32, + "learning_rate": 1.6060862214708368e-05, + "loss": 4.8045, + "step": 380, + "task_loss": 1.8836053609848022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.458423614501953, + "epoch": 0.32, + "learning_rate": 1.610312764158918e-05, + "loss": 4.961, + "step": 381, + "task_loss": 1.8382712602615356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.191441059112549, + "epoch": 0.32, + "learning_rate": 1.614539306846999e-05, + "loss": 4.7982, + "step": 382, + "task_loss": 1.897676706314087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.75653076171875, + "epoch": 0.32, + "learning_rate": 1.6187658495350806e-05, + "loss": 5.0632, + "step": 383, + "task_loss": 2.3354899883270264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.828614234924316, + "epoch": 0.32, + "learning_rate": 1.6229923922231616e-05, + "loss": 4.7467, + "step": 384, + "task_loss": 1.2330222129821777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.0315423011779785, + "epoch": 0.33, + "learning_rate": 1.6272189349112425e-05, + "loss": 5.0099, + "step": 385, + "task_loss": 1.9350425004959106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.52721643447876, + "epoch": 0.33, + "learning_rate": 1.6314454775993238e-05, + "loss": 4.8158, + "step": 386, + "task_loss": 1.4878164529800415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.129389762878418, + "epoch": 0.33, + "learning_rate": 1.6356720202874047e-05, + "loss": 4.7028, + "step": 387, + "task_loss": 1.8630125522613525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.083349227905273, + "epoch": 0.33, + "learning_rate": 1.6398985629754863e-05, + "loss": 4.1317, + "step": 388, + "task_loss": 1.6951853036880493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.8746232986450195, + "epoch": 0.33, + "learning_rate": 1.6441251056635673e-05, + "loss": 4.896, + "step": 389, + "task_loss": 2.5312702655792236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.8668646812438965, + "epoch": 0.33, + "learning_rate": 1.6483516483516486e-05, + "loss": 4.7253, + "step": 390, + "task_loss": 1.8513497114181519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.4477410316467285, + "epoch": 0.33, + "learning_rate": 1.6525781910397295e-05, + "loss": 4.6591, + "step": 391, + "task_loss": 1.965864896774292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.611985206604004, + "epoch": 0.33, + "learning_rate": 1.6568047337278108e-05, + "loss": 4.7619, + "step": 392, + "task_loss": 1.4204081296920776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.54319953918457, + "epoch": 0.33, + "learning_rate": 1.661031276415892e-05, + "loss": 4.4114, + "step": 393, + "task_loss": 2.0045440196990967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.020666122436523, + "epoch": 0.33, + "learning_rate": 1.665257819103973e-05, + "loss": 4.181, + "step": 394, + "task_loss": 2.044067859649658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.768197059631348, + "epoch": 0.33, + "learning_rate": 1.6694843617920543e-05, + "loss": 5.1337, + "step": 395, + "task_loss": 1.7999083995819092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.781773090362549, + "epoch": 0.33, + "learning_rate": 1.6737109044801352e-05, + "loss": 4.3299, + "step": 396, + "task_loss": 1.8045899868011475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.032170295715332, + "epoch": 0.34, + "learning_rate": 1.6779374471682165e-05, + "loss": 4.7602, + "step": 397, + "task_loss": 1.921103596687317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.617743492126465, + "epoch": 0.34, + "learning_rate": 1.6821639898562978e-05, + "loss": 4.3861, + "step": 398, + "task_loss": 1.8125417232513428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.445462226867676, + "epoch": 0.34, + "learning_rate": 1.6863905325443787e-05, + "loss": 4.8476, + "step": 399, + "task_loss": 1.3715507984161377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.970694541931152, + "epoch": 0.34, + "learning_rate": 1.69061707523246e-05, + "loss": 4.6139, + "step": 400, + "task_loss": 1.889340877532959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.383505821228027, + "epoch": 0.34, + "learning_rate": 1.694843617920541e-05, + "loss": 4.0995, + "step": 401, + "task_loss": 2.118938446044922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 6.329733371734619, + "epoch": 0.34, + "learning_rate": 1.6990701606086222e-05, + "loss": 4.5507, + "step": 402, + "task_loss": 2.2250239849090576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.189744472503662, + "epoch": 0.34, + "learning_rate": 1.7032967032967035e-05, + "loss": 3.9479, + "step": 403, + "task_loss": 1.1580610275268555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.4113504886627197, + "epoch": 0.34, + "learning_rate": 1.7075232459847844e-05, + "loss": 4.067, + "step": 404, + "task_loss": 1.7253646850585938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.405399322509766, + "epoch": 0.34, + "learning_rate": 1.7117497886728657e-05, + "loss": 4.5987, + "step": 405, + "task_loss": 1.6780617237091064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.539499282836914, + "epoch": 0.34, + "learning_rate": 1.7159763313609466e-05, + "loss": 3.9092, + "step": 406, + "task_loss": 0.9126412272453308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.492049217224121, + "epoch": 0.34, + "learning_rate": 1.7202028740490282e-05, + "loss": 3.8372, + "step": 407, + "task_loss": 1.1993082761764526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.659815788269043, + "epoch": 0.34, + "learning_rate": 1.7244294167371092e-05, + "loss": 4.0019, + "step": 408, + "task_loss": 1.6589818000793457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.182251930236816, + "epoch": 0.35, + "learning_rate": 1.72865595942519e-05, + "loss": 4.2753, + "step": 409, + "task_loss": 1.3572207689285278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.373161315917969, + "epoch": 0.35, + "learning_rate": 1.7328825021132714e-05, + "loss": 4.525, + "step": 410, + "task_loss": 1.2126350402832031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.914691925048828, + "epoch": 0.35, + "learning_rate": 1.7371090448013523e-05, + "loss": 4.4747, + "step": 411, + "task_loss": 1.7921062707901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.296452045440674, + "epoch": 0.35, + "learning_rate": 1.741335587489434e-05, + "loss": 4.2379, + "step": 412, + "task_loss": 0.5651588439941406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.935454845428467, + "epoch": 0.35, + "learning_rate": 1.745562130177515e-05, + "loss": 3.808, + "step": 413, + "task_loss": 1.6267709732055664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.497427463531494, + "epoch": 0.35, + "learning_rate": 1.749788672865596e-05, + "loss": 4.0697, + "step": 414, + "task_loss": 1.7770404815673828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.01531982421875, + "epoch": 0.35, + "learning_rate": 1.754015215553677e-05, + "loss": 4.0516, + "step": 415, + "task_loss": 1.6030842065811157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.800504207611084, + "epoch": 0.35, + "learning_rate": 1.7582417582417584e-05, + "loss": 3.7625, + "step": 416, + "task_loss": 1.446210503578186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.602176666259766, + "epoch": 0.35, + "learning_rate": 1.7624683009298397e-05, + "loss": 4.5358, + "step": 417, + "task_loss": 1.539741039276123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.226249694824219, + "epoch": 0.35, + "learning_rate": 1.7666948436179206e-05, + "loss": 3.9055, + "step": 418, + "task_loss": 1.6847968101501465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.887887477874756, + "epoch": 0.35, + "learning_rate": 1.770921386306002e-05, + "loss": 3.7248, + "step": 419, + "task_loss": 1.6622414588928223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.5326428413391113, + "epoch": 0.35, + "learning_rate": 1.7751479289940828e-05, + "loss": 3.8524, + "step": 420, + "task_loss": 2.254150390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.487025022506714, + "epoch": 0.36, + "learning_rate": 1.779374471682164e-05, + "loss": 3.5779, + "step": 421, + "task_loss": 1.3388928174972534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.9131360054016113, + "epoch": 0.36, + "learning_rate": 1.7836010143702454e-05, + "loss": 3.989, + "step": 422, + "task_loss": 1.0892082452774048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.182522773742676, + "epoch": 0.36, + "learning_rate": 1.7878275570583263e-05, + "loss": 3.3552, + "step": 423, + "task_loss": 1.460448980331421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.647093296051025, + "epoch": 0.36, + "learning_rate": 1.7920540997464076e-05, + "loss": 4.3733, + "step": 424, + "task_loss": 1.912064552307129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 5.477087497711182, + "epoch": 0.36, + "learning_rate": 1.7962806424344885e-05, + "loss": 3.876, + "step": 425, + "task_loss": 1.1669418811798096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.6018176078796387, + "epoch": 0.36, + "learning_rate": 1.8005071851225698e-05, + "loss": 4.1107, + "step": 426, + "task_loss": 2.482456922531128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.34474515914917, + "epoch": 0.36, + "learning_rate": 1.804733727810651e-05, + "loss": 3.5892, + "step": 427, + "task_loss": 2.2219133377075195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.018124103546143, + "epoch": 0.36, + "learning_rate": 1.808960270498732e-05, + "loss": 3.6816, + "step": 428, + "task_loss": 2.018838882446289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.1224348545074463, + "epoch": 0.36, + "learning_rate": 1.8131868131868133e-05, + "loss": 3.7753, + "step": 429, + "task_loss": 1.4040857553482056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.59801721572876, + "epoch": 0.36, + "learning_rate": 1.8174133558748942e-05, + "loss": 3.811, + "step": 430, + "task_loss": 2.4234955310821533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.734684944152832, + "epoch": 0.36, + "learning_rate": 1.8216398985629755e-05, + "loss": 3.7392, + "step": 431, + "task_loss": 2.027235984802246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.421584606170654, + "epoch": 0.36, + "learning_rate": 1.8258664412510568e-05, + "loss": 4.1532, + "step": 432, + "task_loss": 1.4803043603897095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.6887450218200684, + "epoch": 0.37, + "learning_rate": 1.8300929839391377e-05, + "loss": 3.3957, + "step": 433, + "task_loss": 1.236428141593933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.0300445556640625, + "epoch": 0.37, + "learning_rate": 1.834319526627219e-05, + "loss": 3.3946, + "step": 434, + "task_loss": 1.5627824068069458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.120479106903076, + "epoch": 0.37, + "learning_rate": 1.8385460693153e-05, + "loss": 3.5054, + "step": 435, + "task_loss": 0.7140458226203918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.7074668407440186, + "epoch": 0.37, + "learning_rate": 1.8427726120033816e-05, + "loss": 3.2998, + "step": 436, + "task_loss": 1.4515857696533203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.678971290588379, + "epoch": 0.37, + "learning_rate": 1.8469991546914625e-05, + "loss": 3.5096, + "step": 437, + "task_loss": 2.129225969314575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.707834005355835, + "epoch": 0.37, + "learning_rate": 1.8512256973795435e-05, + "loss": 3.2218, + "step": 438, + "task_loss": 1.6021143198013306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.78014874458313, + "epoch": 0.37, + "learning_rate": 1.8554522400676247e-05, + "loss": 3.3855, + "step": 439, + "task_loss": 1.9996271133422852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.389688014984131, + "epoch": 0.37, + "learning_rate": 1.8596787827557057e-05, + "loss": 3.2295, + "step": 440, + "task_loss": 1.7247347831726074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.061910629272461, + "epoch": 0.37, + "learning_rate": 1.8639053254437873e-05, + "loss": 3.1201, + "step": 441, + "task_loss": 1.2800053358078003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.8365182876586914, + "epoch": 0.37, + "learning_rate": 1.8681318681318682e-05, + "loss": 3.2807, + "step": 442, + "task_loss": 1.5826764106750488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.844360589981079, + "epoch": 0.37, + "learning_rate": 1.8723584108199495e-05, + "loss": 3.6967, + "step": 443, + "task_loss": 2.011652946472168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.9784743785858154, + "epoch": 0.38, + "learning_rate": 1.8765849535080304e-05, + "loss": 3.3886, + "step": 444, + "task_loss": 1.5634260177612305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.5377941131591797, + "epoch": 0.38, + "learning_rate": 1.8808114961961117e-05, + "loss": 3.3721, + "step": 445, + "task_loss": 1.0739930868148804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.4861831665039062, + "epoch": 0.38, + "learning_rate": 1.885038038884193e-05, + "loss": 3.2106, + "step": 446, + "task_loss": 1.2480708360671997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.4460606575012207, + "epoch": 0.38, + "learning_rate": 1.889264581572274e-05, + "loss": 3.1539, + "step": 447, + "task_loss": 1.3746886253356934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.138279914855957, + "epoch": 0.38, + "learning_rate": 1.8934911242603552e-05, + "loss": 2.9943, + "step": 448, + "task_loss": 1.261868953704834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.0119004249572754, + "epoch": 0.38, + "learning_rate": 1.897717666948436e-05, + "loss": 2.6361, + "step": 449, + "task_loss": 1.8958187103271484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.7453014850616455, + "epoch": 0.38, + "learning_rate": 1.9019442096365174e-05, + "loss": 3.4186, + "step": 450, + "task_loss": 1.3763338327407837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.986863613128662, + "epoch": 0.38, + "learning_rate": 1.9061707523245987e-05, + "loss": 3.0818, + "step": 451, + "task_loss": 1.090401291847229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.598148822784424, + "epoch": 0.38, + "learning_rate": 1.9103972950126796e-05, + "loss": 3.0639, + "step": 452, + "task_loss": 1.4045134782791138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.9013113975524902, + "epoch": 0.38, + "learning_rate": 1.914623837700761e-05, + "loss": 3.6508, + "step": 453, + "task_loss": 1.487033724784851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.8980655670166016, + "epoch": 0.38, + "learning_rate": 1.918850380388842e-05, + "loss": 3.3144, + "step": 454, + "task_loss": 1.6256424188613892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.8106842041015625, + "epoch": 0.38, + "learning_rate": 1.923076923076923e-05, + "loss": 3.1061, + "step": 455, + "task_loss": 1.6683250665664673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.250868797302246, + "epoch": 0.39, + "learning_rate": 1.9273034657650044e-05, + "loss": 2.9181, + "step": 456, + "task_loss": 0.7861428260803223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.670706272125244, + "epoch": 0.39, + "learning_rate": 1.9315300084530854e-05, + "loss": 3.3494, + "step": 457, + "task_loss": 1.307903528213501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.914956569671631, + "epoch": 0.39, + "learning_rate": 1.9357565511411666e-05, + "loss": 2.9449, + "step": 458, + "task_loss": 2.018502712249756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.3373169898986816, + "epoch": 0.39, + "learning_rate": 1.9399830938292476e-05, + "loss": 2.5605, + "step": 459, + "task_loss": 1.1175545454025269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.3223793506622314, + "epoch": 0.39, + "learning_rate": 1.944209636517329e-05, + "loss": 2.9028, + "step": 460, + "task_loss": 1.8081469535827637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.100377082824707, + "epoch": 0.39, + "learning_rate": 1.94843617920541e-05, + "loss": 2.918, + "step": 461, + "task_loss": 1.6999719142913818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.4682412147521973, + "epoch": 0.39, + "learning_rate": 1.952662721893491e-05, + "loss": 2.9514, + "step": 462, + "task_loss": 1.1028684377670288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.052436351776123, + "epoch": 0.39, + "learning_rate": 1.9568892645815723e-05, + "loss": 3.1347, + "step": 463, + "task_loss": 1.2945197820663452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.003455638885498, + "epoch": 0.39, + "learning_rate": 1.9611158072696533e-05, + "loss": 2.8966, + "step": 464, + "task_loss": 1.365273356437683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.377332925796509, + "epoch": 0.39, + "learning_rate": 1.965342349957735e-05, + "loss": 2.7946, + "step": 465, + "task_loss": 1.4409900903701782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.902750015258789, + "epoch": 0.39, + "learning_rate": 1.969568892645816e-05, + "loss": 3.2007, + "step": 466, + "task_loss": 1.468926191329956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.440169095993042, + "epoch": 0.39, + "learning_rate": 1.9737954353338968e-05, + "loss": 2.9493, + "step": 467, + "task_loss": 0.9817131757736206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 4.182075023651123, + "epoch": 0.4, + "learning_rate": 1.978021978021978e-05, + "loss": 3.2872, + "step": 468, + "task_loss": 1.5237255096435547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.4946439266204834, + "epoch": 0.4, + "learning_rate": 1.9822485207100593e-05, + "loss": 2.9127, + "step": 469, + "task_loss": 0.9216484427452087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.713230609893799, + "epoch": 0.4, + "learning_rate": 1.9864750633981406e-05, + "loss": 3.0136, + "step": 470, + "task_loss": 2.355800151824951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.5385334491729736, + "epoch": 0.4, + "learning_rate": 1.9907016060862216e-05, + "loss": 3.1428, + "step": 471, + "task_loss": 1.546563982963562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.2123990058898926, + "epoch": 0.4, + "learning_rate": 1.994928148774303e-05, + "loss": 2.6846, + "step": 472, + "task_loss": 1.3410879373550415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.0110840797424316, + "epoch": 0.4, + "learning_rate": 1.9991546914623838e-05, + "loss": 3.226, + "step": 473, + "task_loss": 1.1221226453781128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.2938060760498047, + "epoch": 0.4, + "learning_rate": 2.003381234150465e-05, + "loss": 2.4607, + "step": 474, + "task_loss": 1.049346923828125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.9862170219421387, + "epoch": 0.4, + "learning_rate": 2.0076077768385463e-05, + "loss": 3.0316, + "step": 475, + "task_loss": 0.8953298926353455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.3019683361053467, + "epoch": 0.4, + "learning_rate": 2.0118343195266273e-05, + "loss": 3.089, + "step": 476, + "task_loss": 1.886934757232666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.573516845703125, + "epoch": 0.4, + "learning_rate": 2.0160608622147085e-05, + "loss": 3.0743, + "step": 477, + "task_loss": 1.2565950155258179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.333932638168335, + "epoch": 0.4, + "learning_rate": 2.0202874049027895e-05, + "loss": 2.9761, + "step": 478, + "task_loss": 2.347757577896118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.129934310913086, + "epoch": 0.4, + "learning_rate": 2.0245139475908708e-05, + "loss": 2.3854, + "step": 479, + "task_loss": 1.7285182476043701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.578687906265259, + "epoch": 0.41, + "learning_rate": 2.028740490278952e-05, + "loss": 2.871, + "step": 480, + "task_loss": 1.4707187414169312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.1081323623657227, + "epoch": 0.41, + "learning_rate": 2.032967032967033e-05, + "loss": 3.203, + "step": 481, + "task_loss": 2.3341994285583496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.69724702835083, + "epoch": 0.41, + "learning_rate": 2.0371935756551143e-05, + "loss": 2.5809, + "step": 482, + "task_loss": 1.1442439556121826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.2646069526672363, + "epoch": 0.41, + "learning_rate": 2.0414201183431952e-05, + "loss": 2.828, + "step": 483, + "task_loss": 1.5372252464294434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.8411061763763428, + "epoch": 0.41, + "learning_rate": 2.0456466610312765e-05, + "loss": 2.7103, + "step": 484, + "task_loss": 1.1392698287963867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.7103962898254395, + "epoch": 0.41, + "learning_rate": 2.0498732037193578e-05, + "loss": 3.2071, + "step": 485, + "task_loss": 1.714789867401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.888676643371582, + "epoch": 0.41, + "learning_rate": 2.0540997464074387e-05, + "loss": 2.3974, + "step": 486, + "task_loss": 1.2281129360198975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.6795501708984375, + "epoch": 0.41, + "learning_rate": 2.05832628909552e-05, + "loss": 2.5494, + "step": 487, + "task_loss": 1.2531245946884155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.906757354736328, + "epoch": 0.41, + "learning_rate": 2.062552831783601e-05, + "loss": 2.8051, + "step": 488, + "task_loss": 0.8815559148788452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9760488271713257, + "epoch": 0.41, + "learning_rate": 2.0667793744716822e-05, + "loss": 2.7453, + "step": 489, + "task_loss": 0.9677852392196655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.6790108680725098, + "epoch": 0.41, + "learning_rate": 2.0710059171597635e-05, + "loss": 2.5883, + "step": 490, + "task_loss": 0.6970438957214355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.4974966049194336, + "epoch": 0.41, + "learning_rate": 2.0752324598478444e-05, + "loss": 2.8075, + "step": 491, + "task_loss": 1.277848720550537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.780791997909546, + "epoch": 0.42, + "learning_rate": 2.0794590025359257e-05, + "loss": 2.772, + "step": 492, + "task_loss": 1.3234076499938965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.041388511657715, + "epoch": 0.42, + "learning_rate": 2.083685545224007e-05, + "loss": 2.5044, + "step": 493, + "task_loss": 1.5247846841812134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.6011877059936523, + "epoch": 0.42, + "learning_rate": 2.0879120879120882e-05, + "loss": 2.9537, + "step": 494, + "task_loss": 1.1916651725769043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.6453027725219727, + "epoch": 0.42, + "learning_rate": 2.0921386306001692e-05, + "loss": 2.433, + "step": 495, + "task_loss": 1.9161524772644043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.4757633209228516, + "epoch": 0.42, + "learning_rate": 2.09636517328825e-05, + "loss": 2.5843, + "step": 496, + "task_loss": 1.6245217323303223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.0797581672668457, + "epoch": 0.42, + "learning_rate": 2.1005917159763314e-05, + "loss": 2.9195, + "step": 497, + "task_loss": 1.1057181358337402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.042912483215332, + "epoch": 0.42, + "learning_rate": 2.1048182586644127e-05, + "loss": 2.7021, + "step": 498, + "task_loss": 0.8674214482307434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8304779529571533, + "epoch": 0.42, + "learning_rate": 2.109044801352494e-05, + "loss": 2.0921, + "step": 499, + "task_loss": 0.7738440632820129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.1748499870300293, + "epoch": 0.42, + "learning_rate": 2.113271344040575e-05, + "loss": 2.2676, + "step": 500, + "task_loss": 0.4352116286754608 + }, + { + "epoch": 0.42, + "eval_accuracy": 0.7947326732673268, + "eval_loss": 2.108656167984009, + "eval_runtime": 226.177, + "eval_samples_per_second": 111.638, + "eval_steps_per_second": 0.875, + "step": 500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.1833982467651367, + "epoch": 0.42, + "learning_rate": 2.117497886728656e-05, + "loss": 2.7708, + "step": 501, + "task_loss": 1.7501763105392456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.2712764739990234, + "epoch": 0.42, + "learning_rate": 2.121724429416737e-05, + "loss": 2.39, + "step": 502, + "task_loss": 0.7685309648513794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.638744592666626, + "epoch": 0.42, + "learning_rate": 2.1259509721048184e-05, + "loss": 2.5966, + "step": 503, + "task_loss": 0.982324481010437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.0377399921417236, + "epoch": 0.43, + "learning_rate": 2.1301775147928997e-05, + "loss": 2.4878, + "step": 504, + "task_loss": 1.7711303234100342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.876831531524658, + "epoch": 0.43, + "learning_rate": 2.1344040574809806e-05, + "loss": 2.2551, + "step": 505, + "task_loss": 1.4975056648254395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.882049798965454, + "epoch": 0.43, + "learning_rate": 2.138630600169062e-05, + "loss": 1.8603, + "step": 506, + "task_loss": 1.1373587846755981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.83182692527771, + "epoch": 0.43, + "learning_rate": 2.1428571428571428e-05, + "loss": 2.2318, + "step": 507, + "task_loss": 0.48504412174224854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.381467819213867, + "epoch": 0.43, + "learning_rate": 2.147083685545224e-05, + "loss": 2.3466, + "step": 508, + "task_loss": 0.5970094203948975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.466721296310425, + "epoch": 0.43, + "learning_rate": 2.1513102282333054e-05, + "loss": 2.428, + "step": 509, + "task_loss": 1.0185757875442505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.970189332962036, + "epoch": 0.43, + "learning_rate": 2.1555367709213863e-05, + "loss": 2.5189, + "step": 510, + "task_loss": 1.4654353857040405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7572011947631836, + "epoch": 0.43, + "learning_rate": 2.1597633136094676e-05, + "loss": 2.225, + "step": 511, + "task_loss": 1.4214348793029785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.691232442855835, + "epoch": 0.43, + "learning_rate": 2.1639898562975485e-05, + "loss": 2.4251, + "step": 512, + "task_loss": 1.1146931648254395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.0291330814361572, + "epoch": 0.43, + "learning_rate": 2.1682163989856298e-05, + "loss": 2.7257, + "step": 513, + "task_loss": 0.8977842926979065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.302755832672119, + "epoch": 0.43, + "learning_rate": 2.172442941673711e-05, + "loss": 2.2075, + "step": 514, + "task_loss": 1.5624113082885742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.222282648086548, + "epoch": 0.44, + "learning_rate": 2.176669484361792e-05, + "loss": 2.5413, + "step": 515, + "task_loss": 1.5128321647644043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.756943941116333, + "epoch": 0.44, + "learning_rate": 2.1808960270498733e-05, + "loss": 2.1, + "step": 516, + "task_loss": 1.3281652927398682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.5485689640045166, + "epoch": 0.44, + "learning_rate": 2.1851225697379546e-05, + "loss": 2.3332, + "step": 517, + "task_loss": 2.0654587745666504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.6797261238098145, + "epoch": 0.44, + "learning_rate": 2.189349112426036e-05, + "loss": 2.0501, + "step": 518, + "task_loss": 1.215960144996643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.395606517791748, + "epoch": 0.44, + "learning_rate": 2.1935756551141168e-05, + "loss": 2.1123, + "step": 519, + "task_loss": 1.6138554811477661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.91221284866333, + "epoch": 0.44, + "learning_rate": 2.1978021978021977e-05, + "loss": 2.4037, + "step": 520, + "task_loss": 0.9587717652320862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.0148673057556152, + "epoch": 0.44, + "learning_rate": 2.202028740490279e-05, + "loss": 2.6213, + "step": 521, + "task_loss": 1.174586296081543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.454350471496582, + "epoch": 0.44, + "learning_rate": 2.2062552831783603e-05, + "loss": 2.4022, + "step": 522, + "task_loss": 0.8328395485877991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.1828932762145996, + "epoch": 0.44, + "learning_rate": 2.2104818258664416e-05, + "loss": 2.0248, + "step": 523, + "task_loss": 1.0991419553756714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.8340721130371094, + "epoch": 0.44, + "learning_rate": 2.2147083685545225e-05, + "loss": 2.082, + "step": 524, + "task_loss": 0.8688100576400757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.5442023277282715, + "epoch": 0.44, + "learning_rate": 2.2189349112426034e-05, + "loss": 2.2399, + "step": 525, + "task_loss": 1.4538289308547974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.400397539138794, + "epoch": 0.44, + "learning_rate": 2.2231614539306847e-05, + "loss": 2.3214, + "step": 526, + "task_loss": 1.0998371839523315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.188443422317505, + "epoch": 0.45, + "learning_rate": 2.227387996618766e-05, + "loss": 2.0528, + "step": 527, + "task_loss": 1.5736502408981323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.7074997425079346, + "epoch": 0.45, + "learning_rate": 2.2316145393068473e-05, + "loss": 2.27, + "step": 528, + "task_loss": 1.7446556091308594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.664313793182373, + "epoch": 0.45, + "learning_rate": 2.2358410819949282e-05, + "loss": 2.2817, + "step": 529, + "task_loss": 1.2587699890136719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.640709638595581, + "epoch": 0.45, + "learning_rate": 2.2400676246830095e-05, + "loss": 2.2473, + "step": 530, + "task_loss": 1.3915153741836548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.774782657623291, + "epoch": 0.45, + "learning_rate": 2.2442941673710904e-05, + "loss": 2.2367, + "step": 531, + "task_loss": 1.3553184270858765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.0635666847229004, + "epoch": 0.45, + "learning_rate": 2.2485207100591717e-05, + "loss": 2.0778, + "step": 532, + "task_loss": 1.5550239086151123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.6943728923797607, + "epoch": 0.45, + "learning_rate": 2.252747252747253e-05, + "loss": 2.3212, + "step": 533, + "task_loss": 1.2910689115524292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7708451747894287, + "epoch": 0.45, + "learning_rate": 2.256973795435334e-05, + "loss": 2.1486, + "step": 534, + "task_loss": 1.4383989572525024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.9864957332611084, + "epoch": 0.45, + "learning_rate": 2.2612003381234152e-05, + "loss": 2.4786, + "step": 535, + "task_loss": 1.2517355680465698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.1722757816314697, + "epoch": 0.45, + "learning_rate": 2.265426880811496e-05, + "loss": 2.1645, + "step": 536, + "task_loss": 1.5140818357467651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6191139221191406, + "epoch": 0.45, + "learning_rate": 2.2696534234995774e-05, + "loss": 2.1956, + "step": 537, + "task_loss": 1.139264702796936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 3.1866304874420166, + "epoch": 0.45, + "learning_rate": 2.2738799661876587e-05, + "loss": 2.6427, + "step": 538, + "task_loss": 1.6369730234146118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.068495035171509, + "epoch": 0.46, + "learning_rate": 2.2781065088757396e-05, + "loss": 1.8554, + "step": 539, + "task_loss": 1.1220762729644775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.272372245788574, + "epoch": 0.46, + "learning_rate": 2.282333051563821e-05, + "loss": 2.1855, + "step": 540, + "task_loss": 0.9333959817886353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8025707006454468, + "epoch": 0.46, + "learning_rate": 2.286559594251902e-05, + "loss": 2.2615, + "step": 541, + "task_loss": 1.338422417640686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5329136848449707, + "epoch": 0.46, + "learning_rate": 2.290786136939983e-05, + "loss": 1.9944, + "step": 542, + "task_loss": 0.9137334823608398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7770957946777344, + "epoch": 0.46, + "learning_rate": 2.2950126796280644e-05, + "loss": 1.8463, + "step": 543, + "task_loss": 0.99053555727005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.054398536682129, + "epoch": 0.46, + "learning_rate": 2.2992392223161454e-05, + "loss": 2.2633, + "step": 544, + "task_loss": 0.9043038487434387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.6008200645446777, + "epoch": 0.46, + "learning_rate": 2.3034657650042266e-05, + "loss": 2.0889, + "step": 545, + "task_loss": 1.2475757598876953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.1617512702941895, + "epoch": 0.46, + "learning_rate": 2.307692307692308e-05, + "loss": 2.1881, + "step": 546, + "task_loss": 0.4580702483654022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.823267936706543, + "epoch": 0.46, + "learning_rate": 2.3119188503803892e-05, + "loss": 2.0982, + "step": 547, + "task_loss": 1.370139241218567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.469528913497925, + "epoch": 0.46, + "learning_rate": 2.31614539306847e-05, + "loss": 2.1276, + "step": 548, + "task_loss": 1.1134635210037231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.733099341392517, + "epoch": 0.46, + "learning_rate": 2.320371935756551e-05, + "loss": 1.9433, + "step": 549, + "task_loss": 1.7295442819595337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.6101553440093994, + "epoch": 0.46, + "learning_rate": 2.3245984784446323e-05, + "loss": 2.241, + "step": 550, + "task_loss": 1.6053123474121094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.630253791809082, + "epoch": 0.47, + "learning_rate": 2.3288250211327136e-05, + "loss": 1.5826, + "step": 551, + "task_loss": 1.0710700750350952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6688848733901978, + "epoch": 0.47, + "learning_rate": 2.333051563820795e-05, + "loss": 1.9808, + "step": 552, + "task_loss": 1.5891927480697632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8271673917770386, + "epoch": 0.47, + "learning_rate": 2.337278106508876e-05, + "loss": 1.8334, + "step": 553, + "task_loss": 1.4577524662017822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8559801578521729, + "epoch": 0.47, + "learning_rate": 2.341504649196957e-05, + "loss": 1.8992, + "step": 554, + "task_loss": 0.7260839939117432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.263404369354248, + "epoch": 0.47, + "learning_rate": 2.345731191885038e-05, + "loss": 2.0486, + "step": 555, + "task_loss": 1.0958497524261475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.4724626541137695, + "epoch": 0.47, + "learning_rate": 2.3499577345731193e-05, + "loss": 1.913, + "step": 556, + "task_loss": 0.9496216773986816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.7800204753875732, + "epoch": 0.47, + "learning_rate": 2.3541842772612006e-05, + "loss": 1.7483, + "step": 557, + "task_loss": 2.0007011890411377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.151249885559082, + "epoch": 0.47, + "learning_rate": 2.3584108199492815e-05, + "loss": 1.8281, + "step": 558, + "task_loss": 0.9958018064498901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.1911585330963135, + "epoch": 0.47, + "learning_rate": 2.3626373626373628e-05, + "loss": 2.2765, + "step": 559, + "task_loss": 0.5404365062713623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.868894100189209, + "epoch": 0.47, + "learning_rate": 2.3668639053254438e-05, + "loss": 1.9481, + "step": 560, + "task_loss": 1.770961880683899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6021733283996582, + "epoch": 0.47, + "learning_rate": 2.371090448013525e-05, + "loss": 1.632, + "step": 561, + "task_loss": 0.7386936545372009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.554785966873169, + "epoch": 0.47, + "learning_rate": 2.3753169907016063e-05, + "loss": 2.2806, + "step": 562, + "task_loss": 1.2048391103744507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.723888874053955, + "epoch": 0.48, + "learning_rate": 2.3795435333896873e-05, + "loss": 1.9438, + "step": 563, + "task_loss": 1.2216720581054688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.2916860580444336, + "epoch": 0.48, + "learning_rate": 2.3837700760777685e-05, + "loss": 1.6872, + "step": 564, + "task_loss": 1.7146090269088745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.2390878200531006, + "epoch": 0.48, + "learning_rate": 2.3879966187658495e-05, + "loss": 1.9545, + "step": 565, + "task_loss": 1.2985213994979858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8307461738586426, + "epoch": 0.48, + "learning_rate": 2.3922231614539308e-05, + "loss": 1.8333, + "step": 566, + "task_loss": 0.6718560457229614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.2520875930786133, + "epoch": 0.48, + "learning_rate": 2.396449704142012e-05, + "loss": 2.1959, + "step": 567, + "task_loss": 1.5362135171890259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6778745651245117, + "epoch": 0.48, + "learning_rate": 2.400676246830093e-05, + "loss": 1.879, + "step": 568, + "task_loss": 0.5857014656066895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.2357287406921387, + "epoch": 0.48, + "learning_rate": 2.4049027895181742e-05, + "loss": 2.2874, + "step": 569, + "task_loss": 1.520495891571045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.042743682861328, + "epoch": 0.48, + "learning_rate": 2.4091293322062555e-05, + "loss": 1.8701, + "step": 570, + "task_loss": 1.0886714458465576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3700690269470215, + "epoch": 0.48, + "learning_rate": 2.4133558748943365e-05, + "loss": 1.7621, + "step": 571, + "task_loss": 0.702406644821167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9486669301986694, + "epoch": 0.48, + "learning_rate": 2.4175824175824177e-05, + "loss": 1.544, + "step": 572, + "task_loss": 0.9646759033203125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.1959726810455322, + "epoch": 0.48, + "learning_rate": 2.4218089602704987e-05, + "loss": 2.1131, + "step": 573, + "task_loss": 1.5761204957962036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.38568115234375, + "epoch": 0.48, + "learning_rate": 2.42603550295858e-05, + "loss": 2.0043, + "step": 574, + "task_loss": 1.259171962738037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.0574824810028076, + "epoch": 0.49, + "learning_rate": 2.4302620456466612e-05, + "loss": 1.8995, + "step": 575, + "task_loss": 0.9315642714500427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.021869659423828, + "epoch": 0.49, + "learning_rate": 2.4344885883347425e-05, + "loss": 1.9897, + "step": 576, + "task_loss": 0.9874861836433411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2956435680389404, + "epoch": 0.49, + "learning_rate": 2.4387151310228235e-05, + "loss": 1.8924, + "step": 577, + "task_loss": 1.2937473058700562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.6407103538513184, + "epoch": 0.49, + "learning_rate": 2.4429416737109044e-05, + "loss": 2.1338, + "step": 578, + "task_loss": 1.623302936553955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8056650161743164, + "epoch": 0.49, + "learning_rate": 2.4471682163989857e-05, + "loss": 1.8035, + "step": 579, + "task_loss": 1.652133584022522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7478270530700684, + "epoch": 0.49, + "learning_rate": 2.451394759087067e-05, + "loss": 2.0642, + "step": 580, + "task_loss": 1.1515332460403442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.2086875438690186, + "epoch": 0.49, + "learning_rate": 2.4556213017751482e-05, + "loss": 1.7182, + "step": 581, + "task_loss": 0.7725767493247986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.808950424194336, + "epoch": 0.49, + "learning_rate": 2.459847844463229e-05, + "loss": 1.8926, + "step": 582, + "task_loss": 1.5882987976074219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7286043167114258, + "epoch": 0.49, + "learning_rate": 2.4640743871513104e-05, + "loss": 1.9039, + "step": 583, + "task_loss": 0.6680277585983276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.0558266639709473, + "epoch": 0.49, + "learning_rate": 2.4683009298393914e-05, + "loss": 2.1621, + "step": 584, + "task_loss": 1.1184676885604858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9074819087982178, + "epoch": 0.49, + "learning_rate": 2.4725274725274727e-05, + "loss": 1.9615, + "step": 585, + "task_loss": 1.5058660507202148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6721770763397217, + "epoch": 0.5, + "learning_rate": 2.476754015215554e-05, + "loss": 1.5138, + "step": 586, + "task_loss": 0.29819393157958984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4543733596801758, + "epoch": 0.5, + "learning_rate": 2.480980557903635e-05, + "loss": 1.6232, + "step": 587, + "task_loss": 1.0513654947280884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9924726486206055, + "epoch": 0.5, + "learning_rate": 2.485207100591716e-05, + "loss": 2.155, + "step": 588, + "task_loss": 2.361231565475464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.6872291564941406, + "epoch": 0.5, + "learning_rate": 2.489433643279797e-05, + "loss": 2.0237, + "step": 589, + "task_loss": 1.0504076480865479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7158302068710327, + "epoch": 0.5, + "learning_rate": 2.4936601859678784e-05, + "loss": 1.7117, + "step": 590, + "task_loss": 1.2631242275238037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4364511966705322, + "epoch": 0.5, + "learning_rate": 2.4978867286559597e-05, + "loss": 1.4404, + "step": 591, + "task_loss": 0.7790091633796692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.1071834564208984, + "epoch": 0.5, + "learning_rate": 2.502113271344041e-05, + "loss": 1.9508, + "step": 592, + "task_loss": 0.7317472100257874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.250026226043701, + "epoch": 0.5, + "learning_rate": 2.506339814032122e-05, + "loss": 1.7792, + "step": 593, + "task_loss": 1.3049629926681519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3768805265426636, + "epoch": 0.5, + "learning_rate": 2.510566356720203e-05, + "loss": 1.4023, + "step": 594, + "task_loss": 0.7010613679885864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9267685413360596, + "epoch": 0.5, + "learning_rate": 2.514792899408284e-05, + "loss": 1.5329, + "step": 595, + "task_loss": 0.7476761937141418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7090365886688232, + "epoch": 0.5, + "learning_rate": 2.5190194420963654e-05, + "loss": 1.6372, + "step": 596, + "task_loss": 1.5144426822662354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5731834173202515, + "epoch": 0.5, + "learning_rate": 2.5232459847844463e-05, + "loss": 1.6874, + "step": 597, + "task_loss": 0.6311089992523193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.3774452209472656, + "epoch": 0.51, + "learning_rate": 2.5274725274725276e-05, + "loss": 2.1591, + "step": 598, + "task_loss": 2.3305716514587402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4071381092071533, + "epoch": 0.51, + "learning_rate": 2.5316990701606085e-05, + "loss": 1.5448, + "step": 599, + "task_loss": 1.1393566131591797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2116893529891968, + "epoch": 0.51, + "learning_rate": 2.5359256128486898e-05, + "loss": 1.7468, + "step": 600, + "task_loss": 1.1680995225906372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7497591972351074, + "epoch": 0.51, + "learning_rate": 2.5401521555367707e-05, + "loss": 2.2182, + "step": 601, + "task_loss": 1.7593193054199219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7641654014587402, + "epoch": 0.51, + "learning_rate": 2.5443786982248524e-05, + "loss": 1.9017, + "step": 602, + "task_loss": 1.5303020477294922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8423734903335571, + "epoch": 0.51, + "learning_rate": 2.5486052409129336e-05, + "loss": 1.5398, + "step": 603, + "task_loss": 0.9125271439552307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8104195594787598, + "epoch": 0.51, + "learning_rate": 2.5528317836010146e-05, + "loss": 1.4813, + "step": 604, + "task_loss": 1.417677402496338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.104424238204956, + "epoch": 0.51, + "learning_rate": 2.557058326289096e-05, + "loss": 1.5896, + "step": 605, + "task_loss": 1.5117765665054321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9082846641540527, + "epoch": 0.51, + "learning_rate": 2.5612848689771768e-05, + "loss": 1.4617, + "step": 606, + "task_loss": 0.7563328146934509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.508881092071533, + "epoch": 0.51, + "learning_rate": 2.5655114116652577e-05, + "loss": 2.011, + "step": 607, + "task_loss": 1.1263551712036133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.774326205253601, + "epoch": 0.51, + "learning_rate": 2.569737954353339e-05, + "loss": 1.7182, + "step": 608, + "task_loss": 1.0811508893966675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4354971647262573, + "epoch": 0.51, + "learning_rate": 2.57396449704142e-05, + "loss": 1.7321, + "step": 609, + "task_loss": 1.1322211027145386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.1755032539367676, + "epoch": 0.52, + "learning_rate": 2.5781910397295012e-05, + "loss": 1.5819, + "step": 610, + "task_loss": 1.032667636871338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.321526050567627, + "epoch": 0.52, + "learning_rate": 2.582417582417583e-05, + "loss": 1.3548, + "step": 611, + "task_loss": 1.0109859704971313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.291067123413086, + "epoch": 0.52, + "learning_rate": 2.5866441251056638e-05, + "loss": 1.5062, + "step": 612, + "task_loss": 0.7999576926231384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4695544242858887, + "epoch": 0.52, + "learning_rate": 2.590870667793745e-05, + "loss": 1.7557, + "step": 613, + "task_loss": 1.4489662647247314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4045896530151367, + "epoch": 0.52, + "learning_rate": 2.595097210481826e-05, + "loss": 1.8991, + "step": 614, + "task_loss": 0.850382387638092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.591586947441101, + "epoch": 0.52, + "learning_rate": 2.5993237531699073e-05, + "loss": 1.7515, + "step": 615, + "task_loss": 2.0645599365234375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9465082883834839, + "epoch": 0.52, + "learning_rate": 2.6035502958579882e-05, + "loss": 1.8586, + "step": 616, + "task_loss": 1.6432621479034424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8321236371994019, + "epoch": 0.52, + "learning_rate": 2.6077768385460695e-05, + "loss": 1.6317, + "step": 617, + "task_loss": 2.2543234825134277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9009987115859985, + "epoch": 0.52, + "learning_rate": 2.6120033812341504e-05, + "loss": 1.4536, + "step": 618, + "task_loss": 1.0427546501159668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8423457145690918, + "epoch": 0.52, + "learning_rate": 2.6162299239222317e-05, + "loss": 1.6323, + "step": 619, + "task_loss": 0.9704166650772095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8022714853286743, + "epoch": 0.52, + "learning_rate": 2.6204564666103126e-05, + "loss": 1.5321, + "step": 620, + "task_loss": 1.181983232498169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6737985610961914, + "epoch": 0.52, + "learning_rate": 2.6246830092983943e-05, + "loss": 1.7455, + "step": 621, + "task_loss": 1.2462942600250244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.998228907585144, + "epoch": 0.53, + "learning_rate": 2.6289095519864755e-05, + "loss": 1.5497, + "step": 622, + "task_loss": 1.9189759492874146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.310429573059082, + "epoch": 0.53, + "learning_rate": 2.6331360946745565e-05, + "loss": 1.3369, + "step": 623, + "task_loss": 0.6672812700271606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.1305723190307617, + "epoch": 0.53, + "learning_rate": 2.6373626373626374e-05, + "loss": 1.6624, + "step": 624, + "task_loss": 1.0692880153656006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6739581823349, + "epoch": 0.53, + "learning_rate": 2.6415891800507187e-05, + "loss": 1.8958, + "step": 625, + "task_loss": 1.1022433042526245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.4233789443969727, + "epoch": 0.53, + "learning_rate": 2.6458157227387996e-05, + "loss": 1.8391, + "step": 626, + "task_loss": 1.3821601867675781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7215943336486816, + "epoch": 0.53, + "learning_rate": 2.650042265426881e-05, + "loss": 1.5655, + "step": 627, + "task_loss": 0.9336962699890137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.641430377960205, + "epoch": 0.53, + "learning_rate": 2.654268808114962e-05, + "loss": 1.5572, + "step": 628, + "task_loss": 1.1537126302719116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4905693531036377, + "epoch": 0.53, + "learning_rate": 2.658495350803043e-05, + "loss": 1.1736, + "step": 629, + "task_loss": 1.3647817373275757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.776768445968628, + "epoch": 0.53, + "learning_rate": 2.6627218934911247e-05, + "loss": 1.5419, + "step": 630, + "task_loss": 1.5495095252990723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.119025230407715, + "epoch": 0.53, + "learning_rate": 2.6669484361792057e-05, + "loss": 1.7097, + "step": 631, + "task_loss": 0.6255828738212585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.060655117034912, + "epoch": 0.53, + "learning_rate": 2.671174978867287e-05, + "loss": 1.6324, + "step": 632, + "task_loss": 1.9559584856033325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8026323318481445, + "epoch": 0.53, + "learning_rate": 2.675401521555368e-05, + "loss": 1.3785, + "step": 633, + "task_loss": 0.9295580983161926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.2322726249694824, + "epoch": 0.54, + "learning_rate": 2.6796280642434492e-05, + "loss": 1.7693, + "step": 634, + "task_loss": 1.053109884262085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1831504106521606, + "epoch": 0.54, + "learning_rate": 2.68385460693153e-05, + "loss": 1.3001, + "step": 635, + "task_loss": 0.5708752870559692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6532008647918701, + "epoch": 0.54, + "learning_rate": 2.688081149619611e-05, + "loss": 1.6953, + "step": 636, + "task_loss": 1.8451604843139648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2908958196640015, + "epoch": 0.54, + "learning_rate": 2.6923076923076923e-05, + "loss": 1.5398, + "step": 637, + "task_loss": 0.7879374623298645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3271689414978027, + "epoch": 0.54, + "learning_rate": 2.6965342349957733e-05, + "loss": 1.4815, + "step": 638, + "task_loss": 1.635392665863037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4123780727386475, + "epoch": 0.54, + "learning_rate": 2.7007607776838545e-05, + "loss": 1.8863, + "step": 639, + "task_loss": 0.2529791593551636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.9431493282318115, + "epoch": 0.54, + "learning_rate": 2.704987320371936e-05, + "loss": 1.9035, + "step": 640, + "task_loss": 1.4493820667266846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.060360074043274, + "epoch": 0.54, + "learning_rate": 2.709213863060017e-05, + "loss": 1.5586, + "step": 641, + "task_loss": 0.9246127605438232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6228567361831665, + "epoch": 0.54, + "learning_rate": 2.7134404057480984e-05, + "loss": 1.7603, + "step": 642, + "task_loss": 1.724980354309082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.020940065383911, + "epoch": 0.54, + "learning_rate": 2.7176669484361793e-05, + "loss": 1.8246, + "step": 643, + "task_loss": 1.4467122554779053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1339294910430908, + "epoch": 0.54, + "learning_rate": 2.7218934911242606e-05, + "loss": 1.6619, + "step": 644, + "task_loss": 0.595633864402771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6338285207748413, + "epoch": 0.54, + "learning_rate": 2.7261200338123415e-05, + "loss": 1.6148, + "step": 645, + "task_loss": 0.931357204914093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5364899635314941, + "epoch": 0.55, + "learning_rate": 2.7303465765004228e-05, + "loss": 1.5055, + "step": 646, + "task_loss": 0.731842041015625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7353875637054443, + "epoch": 0.55, + "learning_rate": 2.7345731191885038e-05, + "loss": 1.2771, + "step": 647, + "task_loss": 1.078820824623108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5952110290527344, + "epoch": 0.55, + "learning_rate": 2.738799661876585e-05, + "loss": 1.6303, + "step": 648, + "task_loss": 0.7135847806930542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6236639022827148, + "epoch": 0.55, + "learning_rate": 2.743026204564666e-05, + "loss": 1.2467, + "step": 649, + "task_loss": 0.8168407678604126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8265674114227295, + "epoch": 0.55, + "learning_rate": 2.7472527472527476e-05, + "loss": 1.328, + "step": 650, + "task_loss": 1.279393196105957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9181575775146484, + "epoch": 0.55, + "learning_rate": 2.751479289940829e-05, + "loss": 1.808, + "step": 651, + "task_loss": 1.327600121498108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4949519634246826, + "epoch": 0.55, + "learning_rate": 2.7557058326289098e-05, + "loss": 1.078, + "step": 652, + "task_loss": 0.9232909083366394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5886056423187256, + "epoch": 0.55, + "learning_rate": 2.7599323753169907e-05, + "loss": 1.3246, + "step": 653, + "task_loss": 0.3847731649875641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.26932692527771, + "epoch": 0.55, + "learning_rate": 2.764158918005072e-05, + "loss": 1.1954, + "step": 654, + "task_loss": 0.9609280824661255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.102677583694458, + "epoch": 0.55, + "learning_rate": 2.768385460693153e-05, + "loss": 1.4832, + "step": 655, + "task_loss": 0.8220967650413513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7277220487594604, + "epoch": 0.55, + "learning_rate": 2.7726120033812342e-05, + "loss": 1.522, + "step": 656, + "task_loss": 1.0032366514205933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0998575687408447, + "epoch": 0.56, + "learning_rate": 2.7768385460693152e-05, + "loss": 1.2439, + "step": 657, + "task_loss": 0.7112149000167847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5570416450500488, + "epoch": 0.56, + "learning_rate": 2.7810650887573965e-05, + "loss": 1.5928, + "step": 658, + "task_loss": 0.7085965275764465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4043692350387573, + "epoch": 0.56, + "learning_rate": 2.785291631445478e-05, + "loss": 1.5282, + "step": 659, + "task_loss": 1.1843066215515137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8886358737945557, + "epoch": 0.56, + "learning_rate": 2.789518174133559e-05, + "loss": 1.8172, + "step": 660, + "task_loss": 0.597008466720581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.790836215019226, + "epoch": 0.56, + "learning_rate": 2.7937447168216403e-05, + "loss": 1.711, + "step": 661, + "task_loss": 0.9670142531394958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.763425350189209, + "epoch": 0.56, + "learning_rate": 2.7979712595097212e-05, + "loss": 1.5536, + "step": 662, + "task_loss": 1.916986346244812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6089192628860474, + "epoch": 0.56, + "learning_rate": 2.8021978021978025e-05, + "loss": 1.5651, + "step": 663, + "task_loss": 0.8449296355247498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6258020401000977, + "epoch": 0.56, + "learning_rate": 2.8064243448858834e-05, + "loss": 1.6864, + "step": 664, + "task_loss": 0.9355388879776001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.0082406997680664, + "epoch": 0.56, + "learning_rate": 2.8106508875739644e-05, + "loss": 1.4254, + "step": 665, + "task_loss": 1.5960066318511963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8274633884429932, + "epoch": 0.56, + "learning_rate": 2.8148774302620457e-05, + "loss": 1.3565, + "step": 666, + "task_loss": 1.0102416276931763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.732003092765808, + "epoch": 0.56, + "learning_rate": 2.8191039729501266e-05, + "loss": 1.3596, + "step": 667, + "task_loss": 1.3154973983764648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.780278205871582, + "epoch": 0.56, + "learning_rate": 2.823330515638208e-05, + "loss": 1.5606, + "step": 668, + "task_loss": 1.7912262678146362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5741322040557861, + "epoch": 0.57, + "learning_rate": 2.8275570583262895e-05, + "loss": 1.4136, + "step": 669, + "task_loss": 1.7995178699493408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7928142547607422, + "epoch": 0.57, + "learning_rate": 2.8317836010143704e-05, + "loss": 1.5902, + "step": 670, + "task_loss": 1.5148922204971313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6320691108703613, + "epoch": 0.57, + "learning_rate": 2.8360101437024517e-05, + "loss": 1.8178, + "step": 671, + "task_loss": 1.483723521232605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1542911529541016, + "epoch": 0.57, + "learning_rate": 2.8402366863905327e-05, + "loss": 1.4133, + "step": 672, + "task_loss": 0.6468456983566284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7527374029159546, + "epoch": 0.57, + "learning_rate": 2.844463229078614e-05, + "loss": 1.4153, + "step": 673, + "task_loss": 1.983681082725525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0837173461914062, + "epoch": 0.57, + "learning_rate": 2.848689771766695e-05, + "loss": 1.3309, + "step": 674, + "task_loss": 0.7219658493995667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.297276020050049, + "epoch": 0.57, + "learning_rate": 2.852916314454776e-05, + "loss": 1.7485, + "step": 675, + "task_loss": 1.6987696886062622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.675417184829712, + "epoch": 0.57, + "learning_rate": 2.857142857142857e-05, + "loss": 1.6139, + "step": 676, + "task_loss": 0.8685016632080078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4889869689941406, + "epoch": 0.57, + "learning_rate": 2.8613693998309384e-05, + "loss": 1.454, + "step": 677, + "task_loss": 0.6011234521865845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4284563064575195, + "epoch": 0.57, + "learning_rate": 2.8655959425190193e-05, + "loss": 1.3535, + "step": 678, + "task_loss": 0.5318889617919922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.252799153327942, + "epoch": 0.57, + "learning_rate": 2.869822485207101e-05, + "loss": 1.405, + "step": 679, + "task_loss": 1.4317924976348877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4981613159179688, + "epoch": 0.57, + "learning_rate": 2.8740490278951822e-05, + "loss": 1.3115, + "step": 680, + "task_loss": 1.30126953125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.060190439224243, + "epoch": 0.58, + "learning_rate": 2.878275570583263e-05, + "loss": 1.6098, + "step": 681, + "task_loss": 0.8986676931381226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.408935546875, + "epoch": 0.58, + "learning_rate": 2.882502113271344e-05, + "loss": 1.6122, + "step": 682, + "task_loss": 0.6317296624183655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.0578150749206543, + "epoch": 0.58, + "learning_rate": 2.8867286559594254e-05, + "loss": 1.4871, + "step": 683, + "task_loss": 1.127751111984253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.006550908088684, + "epoch": 0.58, + "learning_rate": 2.8909551986475063e-05, + "loss": 1.1262, + "step": 684, + "task_loss": 1.1229885816574097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8016066551208496, + "epoch": 0.58, + "learning_rate": 2.8951817413355876e-05, + "loss": 1.368, + "step": 685, + "task_loss": 1.1983249187469482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.1605045795440674, + "epoch": 0.58, + "learning_rate": 2.8994082840236685e-05, + "loss": 1.494, + "step": 686, + "task_loss": 0.6800357699394226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5272266864776611, + "epoch": 0.58, + "learning_rate": 2.9036348267117498e-05, + "loss": 1.3462, + "step": 687, + "task_loss": 1.8150286674499512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0030484199523926, + "epoch": 0.58, + "learning_rate": 2.9078613693998314e-05, + "loss": 1.1387, + "step": 688, + "task_loss": 0.8048574924468994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.40828275680542, + "epoch": 0.58, + "learning_rate": 2.9120879120879123e-05, + "loss": 1.3236, + "step": 689, + "task_loss": 0.5279685258865356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1242307424545288, + "epoch": 0.58, + "learning_rate": 2.9163144547759936e-05, + "loss": 1.4809, + "step": 690, + "task_loss": 0.4531760513782501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8669126033782959, + "epoch": 0.58, + "learning_rate": 2.9205409974640746e-05, + "loss": 1.2519, + "step": 691, + "task_loss": 0.3343375623226166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5432629585266113, + "epoch": 0.58, + "learning_rate": 2.924767540152156e-05, + "loss": 1.5538, + "step": 692, + "task_loss": 1.3851969242095947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4076192378997803, + "epoch": 0.59, + "learning_rate": 2.9289940828402368e-05, + "loss": 1.4887, + "step": 693, + "task_loss": 1.8642549514770508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.7077107429504395, + "epoch": 0.59, + "learning_rate": 2.933220625528318e-05, + "loss": 1.51, + "step": 694, + "task_loss": 1.6698142290115356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5578193664550781, + "epoch": 0.59, + "learning_rate": 2.937447168216399e-05, + "loss": 1.4737, + "step": 695, + "task_loss": 1.489617943763733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1586830615997314, + "epoch": 0.59, + "learning_rate": 2.94167371090448e-05, + "loss": 1.2867, + "step": 696, + "task_loss": 0.6294510364532471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1899855136871338, + "epoch": 0.59, + "learning_rate": 2.9459002535925612e-05, + "loss": 1.1781, + "step": 697, + "task_loss": 0.8063412308692932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8221255540847778, + "epoch": 0.59, + "learning_rate": 2.9501267962806428e-05, + "loss": 1.4055, + "step": 698, + "task_loss": 0.6587286591529846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3647469282150269, + "epoch": 0.59, + "learning_rate": 2.9543533389687238e-05, + "loss": 1.3671, + "step": 699, + "task_loss": 0.5070088505744934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3376853466033936, + "epoch": 0.59, + "learning_rate": 2.958579881656805e-05, + "loss": 1.4248, + "step": 700, + "task_loss": 0.905743420124054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.329820990562439, + "epoch": 0.59, + "learning_rate": 2.962806424344886e-05, + "loss": 1.5463, + "step": 701, + "task_loss": 0.8405337929725647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7667867541313171, + "epoch": 0.59, + "learning_rate": 2.9670329670329673e-05, + "loss": 1.0905, + "step": 702, + "task_loss": 0.7684465646743774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4072433710098267, + "epoch": 0.59, + "learning_rate": 2.9712595097210482e-05, + "loss": 1.3943, + "step": 703, + "task_loss": 1.0100133419036865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9178508520126343, + "epoch": 0.59, + "learning_rate": 2.9754860524091295e-05, + "loss": 1.786, + "step": 704, + "task_loss": 1.9493777751922607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9487297534942627, + "epoch": 0.6, + "learning_rate": 2.9797125950972104e-05, + "loss": 1.4663, + "step": 705, + "task_loss": 2.259042263031006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.528301477432251, + "epoch": 0.6, + "learning_rate": 2.9839391377852917e-05, + "loss": 1.1631, + "step": 706, + "task_loss": 0.28674349188804626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3450632095336914, + "epoch": 0.6, + "learning_rate": 2.9881656804733733e-05, + "loss": 1.4949, + "step": 707, + "task_loss": 1.143014907836914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.409010887145996, + "epoch": 0.6, + "learning_rate": 2.9923922231614543e-05, + "loss": 1.3435, + "step": 708, + "task_loss": 1.0360783338546753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.161576747894287, + "epoch": 0.6, + "learning_rate": 2.9966187658495355e-05, + "loss": 1.7862, + "step": 709, + "task_loss": 0.5741552710533142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.795288622379303, + "epoch": 0.6, + "learning_rate": 3.0008453085376165e-05, + "loss": 1.4654, + "step": 710, + "task_loss": 1.225522518157959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9617218971252441, + "epoch": 0.6, + "learning_rate": 3.0050718512256974e-05, + "loss": 1.4648, + "step": 711, + "task_loss": 0.9943562746047974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.620964765548706, + "epoch": 0.6, + "learning_rate": 3.0092983939137787e-05, + "loss": 1.6005, + "step": 712, + "task_loss": 1.2757292985916138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2072701454162598, + "epoch": 0.6, + "learning_rate": 3.0135249366018596e-05, + "loss": 1.0605, + "step": 713, + "task_loss": 0.8487045168876648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8367528915405273, + "epoch": 0.6, + "learning_rate": 3.017751479289941e-05, + "loss": 1.8295, + "step": 714, + "task_loss": 0.8496079444885254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1372811794281006, + "epoch": 0.6, + "learning_rate": 3.021978021978022e-05, + "loss": 1.1556, + "step": 715, + "task_loss": 0.845334529876709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0037188529968262, + "epoch": 0.6, + "learning_rate": 3.026204564666103e-05, + "loss": 1.0427, + "step": 716, + "task_loss": 1.1058974266052246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3244569301605225, + "epoch": 0.61, + "learning_rate": 3.0304311073541847e-05, + "loss": 1.3545, + "step": 717, + "task_loss": 1.2813187837600708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1645981073379517, + "epoch": 0.61, + "learning_rate": 3.0346576500422657e-05, + "loss": 1.2698, + "step": 718, + "task_loss": 0.7977056503295898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3454211950302124, + "epoch": 0.61, + "learning_rate": 3.038884192730347e-05, + "loss": 1.3901, + "step": 719, + "task_loss": 1.0199166536331177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.060537338256836, + "epoch": 0.61, + "learning_rate": 3.043110735418428e-05, + "loss": 1.5656, + "step": 720, + "task_loss": 1.0599092245101929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6241382360458374, + "epoch": 0.61, + "learning_rate": 3.047337278106509e-05, + "loss": 1.2238, + "step": 721, + "task_loss": 2.105393171310425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5769617557525635, + "epoch": 0.61, + "learning_rate": 3.05156382079459e-05, + "loss": 1.1988, + "step": 722, + "task_loss": 0.9134297370910645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0414657592773438, + "epoch": 0.61, + "learning_rate": 3.0557903634826714e-05, + "loss": 1.5899, + "step": 723, + "task_loss": 0.6078497171401978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5410141944885254, + "epoch": 0.61, + "learning_rate": 3.060016906170752e-05, + "loss": 1.3623, + "step": 724, + "task_loss": 1.8485307693481445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8764464855194092, + "epoch": 0.61, + "learning_rate": 3.064243448858833e-05, + "loss": 1.3635, + "step": 725, + "task_loss": 1.9136000871658325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1411190032958984, + "epoch": 0.61, + "learning_rate": 3.068469991546914e-05, + "loss": 1.4542, + "step": 726, + "task_loss": 0.7353646159172058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1371047496795654, + "epoch": 0.61, + "learning_rate": 3.072696534234996e-05, + "loss": 1.1312, + "step": 727, + "task_loss": 1.0274072885513306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9875434041023254, + "epoch": 0.61, + "learning_rate": 3.0769230769230774e-05, + "loss": 1.1656, + "step": 728, + "task_loss": 0.7100459933280945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0378708839416504, + "epoch": 0.62, + "learning_rate": 3.0811496196111584e-05, + "loss": 1.4261, + "step": 729, + "task_loss": 1.214665412902832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5776360034942627, + "epoch": 0.62, + "learning_rate": 3.085376162299239e-05, + "loss": 1.4927, + "step": 730, + "task_loss": 1.2391765117645264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9539617300033569, + "epoch": 0.62, + "learning_rate": 3.08960270498732e-05, + "loss": 1.1092, + "step": 731, + "task_loss": 1.031744122505188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7882373332977295, + "epoch": 0.62, + "learning_rate": 3.093829247675402e-05, + "loss": 1.2236, + "step": 732, + "task_loss": 1.0188167095184326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8838756084442139, + "epoch": 0.62, + "learning_rate": 3.098055790363483e-05, + "loss": 1.3281, + "step": 733, + "task_loss": 1.997550368309021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2788608074188232, + "epoch": 0.62, + "learning_rate": 3.102282333051564e-05, + "loss": 1.3928, + "step": 734, + "task_loss": 0.8079401850700378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7655508518218994, + "epoch": 0.62, + "learning_rate": 3.106508875739645e-05, + "loss": 1.3034, + "step": 735, + "task_loss": 0.6227017045021057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4453158378601074, + "epoch": 0.62, + "learning_rate": 3.110735418427726e-05, + "loss": 1.3911, + "step": 736, + "task_loss": 0.9161060452461243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3663349151611328, + "epoch": 0.62, + "learning_rate": 3.114961961115808e-05, + "loss": 0.9101, + "step": 737, + "task_loss": 0.842225193977356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5438990592956543, + "epoch": 0.62, + "learning_rate": 3.119188503803889e-05, + "loss": 1.5341, + "step": 738, + "task_loss": 0.47739672660827637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2349551916122437, + "epoch": 0.62, + "learning_rate": 3.12341504649197e-05, + "loss": 1.396, + "step": 739, + "task_loss": 0.9894749522209167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0047895908355713, + "epoch": 0.63, + "learning_rate": 3.127641589180051e-05, + "loss": 0.8866, + "step": 740, + "task_loss": 0.9324040412902832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4260210990905762, + "epoch": 0.63, + "learning_rate": 3.131868131868132e-05, + "loss": 1.2471, + "step": 741, + "task_loss": 0.3376115560531616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7668278217315674, + "epoch": 0.63, + "learning_rate": 3.136094674556213e-05, + "loss": 1.3282, + "step": 742, + "task_loss": 1.7642693519592285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4765636920928955, + "epoch": 0.63, + "learning_rate": 3.140321217244294e-05, + "loss": 1.3615, + "step": 743, + "task_loss": 1.8432987928390503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4979791641235352, + "epoch": 0.63, + "learning_rate": 3.144547759932375e-05, + "loss": 1.2393, + "step": 744, + "task_loss": 1.9573218822479248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3229753971099854, + "epoch": 0.63, + "learning_rate": 3.148774302620456e-05, + "loss": 1.2009, + "step": 745, + "task_loss": 0.6425274610519409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9752696752548218, + "epoch": 0.63, + "learning_rate": 3.153000845308538e-05, + "loss": 1.2667, + "step": 746, + "task_loss": 1.6002529859542847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6821339726448059, + "epoch": 0.63, + "learning_rate": 3.1572273879966193e-05, + "loss": 1.0693, + "step": 747, + "task_loss": 0.3564088046550751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3711963891983032, + "epoch": 0.63, + "learning_rate": 3.1614539306847e-05, + "loss": 1.3266, + "step": 748, + "task_loss": 1.7210822105407715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7442446351051331, + "epoch": 0.63, + "learning_rate": 3.165680473372781e-05, + "loss": 1.0009, + "step": 749, + "task_loss": 0.538000226020813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6805883646011353, + "epoch": 0.63, + "learning_rate": 3.169907016060862e-05, + "loss": 1.4701, + "step": 750, + "task_loss": 0.820415735244751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4134013652801514, + "epoch": 0.63, + "learning_rate": 3.174133558748944e-05, + "loss": 1.1963, + "step": 751, + "task_loss": 1.3457452058792114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8930699825286865, + "epoch": 0.64, + "learning_rate": 3.178360101437025e-05, + "loss": 1.2316, + "step": 752, + "task_loss": 1.210572361946106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1586639881134033, + "epoch": 0.64, + "learning_rate": 3.1825866441251057e-05, + "loss": 1.1674, + "step": 753, + "task_loss": 1.4667516946792603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9502545595169067, + "epoch": 0.64, + "learning_rate": 3.1868131868131866e-05, + "loss": 0.8921, + "step": 754, + "task_loss": 0.8325430750846863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9357709884643555, + "epoch": 0.64, + "learning_rate": 3.1910397295012675e-05, + "loss": 0.9295, + "step": 755, + "task_loss": 0.762615442276001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7579373717308044, + "epoch": 0.64, + "learning_rate": 3.195266272189349e-05, + "loss": 1.0436, + "step": 756, + "task_loss": 0.7855313420295715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9126052856445312, + "epoch": 0.64, + "learning_rate": 3.199492814877431e-05, + "loss": 1.551, + "step": 757, + "task_loss": 1.4528396129608154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0547230243682861, + "epoch": 0.64, + "learning_rate": 3.203719357565512e-05, + "loss": 1.0869, + "step": 758, + "task_loss": 0.6683381795883179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0525503158569336, + "epoch": 0.64, + "learning_rate": 3.2079459002535926e-05, + "loss": 0.9953, + "step": 759, + "task_loss": 0.18759943544864655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8616998195648193, + "epoch": 0.64, + "learning_rate": 3.2121724429416736e-05, + "loss": 1.3584, + "step": 760, + "task_loss": 0.9390403032302856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9221686720848083, + "epoch": 0.64, + "learning_rate": 3.216398985629755e-05, + "loss": 1.3151, + "step": 761, + "task_loss": 0.7151225805282593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3533369302749634, + "epoch": 0.64, + "learning_rate": 3.220625528317836e-05, + "loss": 0.9055, + "step": 762, + "task_loss": 1.5168225765228271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.100745677947998, + "epoch": 0.64, + "learning_rate": 3.224852071005917e-05, + "loss": 1.2536, + "step": 763, + "task_loss": 0.38282209634780884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0497146844863892, + "epoch": 0.65, + "learning_rate": 3.229078613693998e-05, + "loss": 1.2617, + "step": 764, + "task_loss": 0.8950813412666321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9626649618148804, + "epoch": 0.65, + "learning_rate": 3.2333051563820796e-05, + "loss": 1.4889, + "step": 765, + "task_loss": 0.6804001927375793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1379417181015015, + "epoch": 0.65, + "learning_rate": 3.237531699070161e-05, + "loss": 1.2559, + "step": 766, + "task_loss": 0.870660662651062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.016981840133667, + "epoch": 0.65, + "learning_rate": 3.241758241758242e-05, + "loss": 1.4095, + "step": 767, + "task_loss": 1.0748240947723389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8861956000328064, + "epoch": 0.65, + "learning_rate": 3.245984784446323e-05, + "loss": 1.2675, + "step": 768, + "task_loss": 0.615392804145813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8045220375061035, + "epoch": 0.65, + "learning_rate": 3.250211327134404e-05, + "loss": 0.9897, + "step": 769, + "task_loss": 1.6343724727630615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.56914484500885, + "epoch": 0.65, + "learning_rate": 3.254437869822485e-05, + "loss": 1.184, + "step": 770, + "task_loss": 1.6107184886932373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2962971925735474, + "epoch": 0.65, + "learning_rate": 3.2586644125105666e-05, + "loss": 1.3232, + "step": 771, + "task_loss": 0.7560681104660034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2538061141967773, + "epoch": 0.65, + "learning_rate": 3.2628909551986476e-05, + "loss": 1.0573, + "step": 772, + "task_loss": 1.1959505081176758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0524084568023682, + "epoch": 0.65, + "learning_rate": 3.2671174978867285e-05, + "loss": 1.4628, + "step": 773, + "task_loss": 0.5565171837806702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1373214721679688, + "epoch": 0.65, + "learning_rate": 3.2713440405748094e-05, + "loss": 1.123, + "step": 774, + "task_loss": 1.6657607555389404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0028746128082275, + "epoch": 0.65, + "learning_rate": 3.275570583262891e-05, + "loss": 1.0034, + "step": 775, + "task_loss": 0.790636420249939 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6192691326141357, + "epoch": 0.66, + "learning_rate": 3.279797125950973e-05, + "loss": 1.3584, + "step": 776, + "task_loss": 1.3156646490097046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8031277656555176, + "epoch": 0.66, + "learning_rate": 3.2840236686390536e-05, + "loss": 0.9986, + "step": 777, + "task_loss": 0.15089592337608337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4036678075790405, + "epoch": 0.66, + "learning_rate": 3.2882502113271346e-05, + "loss": 0.9864, + "step": 778, + "task_loss": 1.3838911056518555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6737282276153564, + "epoch": 0.66, + "learning_rate": 3.2924767540152155e-05, + "loss": 1.0778, + "step": 779, + "task_loss": 0.986405611038208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1664226055145264, + "epoch": 0.66, + "learning_rate": 3.296703296703297e-05, + "loss": 1.1947, + "step": 780, + "task_loss": 0.6116193532943726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7758909463882446, + "epoch": 0.66, + "learning_rate": 3.300929839391378e-05, + "loss": 1.0504, + "step": 781, + "task_loss": 0.7467387318611145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0047651529312134, + "epoch": 0.66, + "learning_rate": 3.305156382079459e-05, + "loss": 1.0698, + "step": 782, + "task_loss": 1.6976722478866577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9224201440811157, + "epoch": 0.66, + "learning_rate": 3.30938292476754e-05, + "loss": 0.9493, + "step": 783, + "task_loss": 0.40012145042419434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.945979118347168, + "epoch": 0.66, + "learning_rate": 3.3136094674556215e-05, + "loss": 1.2699, + "step": 784, + "task_loss": 0.8882613778114319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4105168581008911, + "epoch": 0.66, + "learning_rate": 3.317836010143703e-05, + "loss": 1.0217, + "step": 785, + "task_loss": 1.1323537826538086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0683871507644653, + "epoch": 0.66, + "learning_rate": 3.322062552831784e-05, + "loss": 1.206, + "step": 786, + "task_loss": 0.3326896131038666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7210911512374878, + "epoch": 0.66, + "learning_rate": 3.326289095519865e-05, + "loss": 1.1225, + "step": 787, + "task_loss": 0.579444408416748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0139968395233154, + "epoch": 0.67, + "learning_rate": 3.330515638207946e-05, + "loss": 1.2026, + "step": 788, + "task_loss": 0.6448048949241638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.266096830368042, + "epoch": 0.67, + "learning_rate": 3.334742180896027e-05, + "loss": 1.1261, + "step": 789, + "task_loss": 0.6363561153411865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.163905382156372, + "epoch": 0.67, + "learning_rate": 3.3389687235841085e-05, + "loss": 1.2274, + "step": 790, + "task_loss": 1.3863892555236816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2620033025741577, + "epoch": 0.67, + "learning_rate": 3.3431952662721895e-05, + "loss": 1.4208, + "step": 791, + "task_loss": 0.47367507219314575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8582741022109985, + "epoch": 0.67, + "learning_rate": 3.3474218089602704e-05, + "loss": 1.1125, + "step": 792, + "task_loss": 1.041414737701416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0617467164993286, + "epoch": 0.67, + "learning_rate": 3.3516483516483513e-05, + "loss": 1.1794, + "step": 793, + "task_loss": 0.8185568451881409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4089443683624268, + "epoch": 0.67, + "learning_rate": 3.355874894336433e-05, + "loss": 1.0588, + "step": 794, + "task_loss": 1.1982653141021729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7130084037780762, + "epoch": 0.67, + "learning_rate": 3.3601014370245146e-05, + "loss": 1.2627, + "step": 795, + "task_loss": 0.8388482332229614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3874064683914185, + "epoch": 0.67, + "learning_rate": 3.3643279797125955e-05, + "loss": 1.1283, + "step": 796, + "task_loss": 0.9146378636360168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.582143783569336, + "epoch": 0.67, + "learning_rate": 3.3685545224006765e-05, + "loss": 1.2534, + "step": 797, + "task_loss": 1.2254849672317505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9507005214691162, + "epoch": 0.67, + "learning_rate": 3.3727810650887574e-05, + "loss": 1.2953, + "step": 798, + "task_loss": 2.448974609375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6781567931175232, + "epoch": 0.67, + "learning_rate": 3.377007607776838e-05, + "loss": 1.1329, + "step": 799, + "task_loss": 0.5950313210487366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.929793119430542, + "epoch": 0.68, + "learning_rate": 3.38123415046492e-05, + "loss": 1.2051, + "step": 800, + "task_loss": 0.6294661164283752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6224916577339172, + "epoch": 0.68, + "learning_rate": 3.385460693153001e-05, + "loss": 0.9265, + "step": 801, + "task_loss": 0.24103760719299316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9142379760742188, + "epoch": 0.68, + "learning_rate": 3.389687235841082e-05, + "loss": 1.3758, + "step": 802, + "task_loss": 1.322835087776184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.315333604812622, + "epoch": 0.68, + "learning_rate": 3.393913778529163e-05, + "loss": 1.4024, + "step": 803, + "task_loss": 1.4778238534927368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.067352533340454, + "epoch": 0.68, + "learning_rate": 3.3981403212172444e-05, + "loss": 0.8641, + "step": 804, + "task_loss": 1.5639110803604126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.243212103843689, + "epoch": 0.68, + "learning_rate": 3.402366863905326e-05, + "loss": 1.1398, + "step": 805, + "task_loss": 0.8332517743110657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7636231184005737, + "epoch": 0.68, + "learning_rate": 3.406593406593407e-05, + "loss": 1.3156, + "step": 806, + "task_loss": 1.280535101890564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.323514699935913, + "epoch": 0.68, + "learning_rate": 3.410819949281488e-05, + "loss": 1.024, + "step": 807, + "task_loss": 0.9092371463775635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.464396357536316, + "epoch": 0.68, + "learning_rate": 3.415046491969569e-05, + "loss": 1.3036, + "step": 808, + "task_loss": 1.3983303308486938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.9059646129608154, + "epoch": 0.68, + "learning_rate": 3.4192730346576504e-05, + "loss": 1.0737, + "step": 809, + "task_loss": 1.2400716543197632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2472960948944092, + "epoch": 0.68, + "learning_rate": 3.4234995773457314e-05, + "loss": 1.1997, + "step": 810, + "task_loss": 1.3540829420089722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8057377338409424, + "epoch": 0.69, + "learning_rate": 3.427726120033812e-05, + "loss": 0.7798, + "step": 811, + "task_loss": 0.2278326153755188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0885618925094604, + "epoch": 0.69, + "learning_rate": 3.431952662721893e-05, + "loss": 1.1224, + "step": 812, + "task_loss": 0.5487210154533386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.021493673324585, + "epoch": 0.69, + "learning_rate": 3.436179205409975e-05, + "loss": 0.9995, + "step": 813, + "task_loss": 0.2578725218772888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0298852920532227, + "epoch": 0.69, + "learning_rate": 3.4404057480980565e-05, + "loss": 0.9818, + "step": 814, + "task_loss": 1.0327537059783936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2061482667922974, + "epoch": 0.69, + "learning_rate": 3.4446322907861374e-05, + "loss": 1.2802, + "step": 815, + "task_loss": 0.5380061268806458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5140435695648193, + "epoch": 0.69, + "learning_rate": 3.4488588334742184e-05, + "loss": 0.8722, + "step": 816, + "task_loss": 0.31900647282600403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.659974217414856, + "epoch": 0.69, + "learning_rate": 3.453085376162299e-05, + "loss": 0.8583, + "step": 817, + "task_loss": 0.7299270033836365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.0004239082336426, + "epoch": 0.69, + "learning_rate": 3.45731191885038e-05, + "loss": 1.4279, + "step": 818, + "task_loss": 0.9796584844589233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1866278648376465, + "epoch": 0.69, + "learning_rate": 3.461538461538462e-05, + "loss": 1.0565, + "step": 819, + "task_loss": 1.3525701761245728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8020912408828735, + "epoch": 0.69, + "learning_rate": 3.465765004226543e-05, + "loss": 1.4161, + "step": 820, + "task_loss": 1.020184874534607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7706226110458374, + "epoch": 0.69, + "learning_rate": 3.469991546914624e-05, + "loss": 0.9847, + "step": 821, + "task_loss": 0.7895649075508118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6366397738456726, + "epoch": 0.69, + "learning_rate": 3.474218089602705e-05, + "loss": 0.98, + "step": 822, + "task_loss": 1.0765902996063232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8741238117218018, + "epoch": 0.7, + "learning_rate": 3.478444632290786e-05, + "loss": 0.9404, + "step": 823, + "task_loss": 0.5794886946678162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.686323881149292, + "epoch": 0.7, + "learning_rate": 3.482671174978868e-05, + "loss": 1.2467, + "step": 824, + "task_loss": 0.9441559314727783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8098551034927368, + "epoch": 0.7, + "learning_rate": 3.486897717666949e-05, + "loss": 0.8981, + "step": 825, + "task_loss": 0.3559379577636719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7981609106063843, + "epoch": 0.7, + "learning_rate": 3.49112426035503e-05, + "loss": 0.9523, + "step": 826, + "task_loss": 0.6286649107933044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2563947439193726, + "epoch": 0.7, + "learning_rate": 3.495350803043111e-05, + "loss": 1.0617, + "step": 827, + "task_loss": 1.103269100189209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.69051194190979, + "epoch": 0.7, + "learning_rate": 3.499577345731192e-05, + "loss": 1.212, + "step": 828, + "task_loss": 0.49598947167396545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8391950726509094, + "epoch": 0.7, + "learning_rate": 3.503803888419273e-05, + "loss": 0.9746, + "step": 829, + "task_loss": 0.7219027280807495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4685629606246948, + "epoch": 0.7, + "learning_rate": 3.508030431107354e-05, + "loss": 1.1247, + "step": 830, + "task_loss": 1.080963134765625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1293282508850098, + "epoch": 0.7, + "learning_rate": 3.512256973795435e-05, + "loss": 1.0864, + "step": 831, + "task_loss": 1.5669431686401367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.250570297241211, + "epoch": 0.7, + "learning_rate": 3.516483516483517e-05, + "loss": 0.9911, + "step": 832, + "task_loss": 1.3642091751098633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.125456690788269, + "epoch": 0.7, + "learning_rate": 3.520710059171598e-05, + "loss": 1.1822, + "step": 833, + "task_loss": 2.5408148765563965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8799680471420288, + "epoch": 0.7, + "learning_rate": 3.524936601859679e-05, + "loss": 1.1337, + "step": 834, + "task_loss": 1.5254669189453125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.424616813659668, + "epoch": 0.71, + "learning_rate": 3.52916314454776e-05, + "loss": 1.1357, + "step": 835, + "task_loss": 1.3318520784378052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3314528465270996, + "epoch": 0.71, + "learning_rate": 3.533389687235841e-05, + "loss": 1.002, + "step": 836, + "task_loss": 1.339030146598816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7002354860305786, + "epoch": 0.71, + "learning_rate": 3.537616229923922e-05, + "loss": 1.1341, + "step": 837, + "task_loss": 0.7738394737243652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8383018374443054, + "epoch": 0.71, + "learning_rate": 3.541842772612004e-05, + "loss": 1.1663, + "step": 838, + "task_loss": 0.9395821690559387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2065908908843994, + "epoch": 0.71, + "learning_rate": 3.546069315300085e-05, + "loss": 1.1805, + "step": 839, + "task_loss": 0.7872021198272705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7227573394775391, + "epoch": 0.71, + "learning_rate": 3.5502958579881656e-05, + "loss": 0.8076, + "step": 840, + "task_loss": 1.3456993103027344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.201753854751587, + "epoch": 0.71, + "learning_rate": 3.5545224006762466e-05, + "loss": 1.2693, + "step": 841, + "task_loss": 0.9918860793113708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.8004281520843506, + "epoch": 0.71, + "learning_rate": 3.558748943364328e-05, + "loss": 1.2814, + "step": 842, + "task_loss": 1.1303761005401611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5839501619338989, + "epoch": 0.71, + "learning_rate": 3.56297548605241e-05, + "loss": 1.0925, + "step": 843, + "task_loss": 0.3605928421020508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9251384139060974, + "epoch": 0.71, + "learning_rate": 3.567202028740491e-05, + "loss": 1.074, + "step": 844, + "task_loss": 0.2997857332229614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5477241277694702, + "epoch": 0.71, + "learning_rate": 3.571428571428572e-05, + "loss": 1.2679, + "step": 845, + "task_loss": 0.6827241778373718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4141407012939453, + "epoch": 0.71, + "learning_rate": 3.5756551141166526e-05, + "loss": 1.1589, + "step": 846, + "task_loss": 0.8392406105995178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.366410255432129, + "epoch": 0.72, + "learning_rate": 3.5798816568047336e-05, + "loss": 1.0562, + "step": 847, + "task_loss": 1.897608757019043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.123450756072998, + "epoch": 0.72, + "learning_rate": 3.584108199492815e-05, + "loss": 1.1623, + "step": 848, + "task_loss": 1.5228495597839355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6552289724349976, + "epoch": 0.72, + "learning_rate": 3.588334742180896e-05, + "loss": 1.0923, + "step": 849, + "task_loss": 0.8985643982887268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6603604555130005, + "epoch": 0.72, + "learning_rate": 3.592561284868977e-05, + "loss": 0.8798, + "step": 850, + "task_loss": 1.2749196290969849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0338457822799683, + "epoch": 0.72, + "learning_rate": 3.596787827557058e-05, + "loss": 1.1585, + "step": 851, + "task_loss": 1.3262828588485718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2368450164794922, + "epoch": 0.72, + "learning_rate": 3.6010143702451396e-05, + "loss": 1.0772, + "step": 852, + "task_loss": 1.0315959453582764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.005903720855713, + "epoch": 0.72, + "learning_rate": 3.605240912933221e-05, + "loss": 1.2439, + "step": 853, + "task_loss": 0.48447489738464355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.346367359161377, + "epoch": 0.72, + "learning_rate": 3.609467455621302e-05, + "loss": 0.8882, + "step": 854, + "task_loss": 0.4428337514400482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.336367130279541, + "epoch": 0.72, + "learning_rate": 3.613693998309383e-05, + "loss": 1.0771, + "step": 855, + "task_loss": 1.4890811443328857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.063593864440918, + "epoch": 0.72, + "learning_rate": 3.617920540997464e-05, + "loss": 0.9776, + "step": 856, + "task_loss": 0.37484872341156006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.009964108467102, + "epoch": 0.72, + "learning_rate": 3.622147083685546e-05, + "loss": 0.9593, + "step": 857, + "task_loss": 0.6104087829589844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0258214473724365, + "epoch": 0.72, + "learning_rate": 3.6263736263736266e-05, + "loss": 1.1007, + "step": 858, + "task_loss": 1.0949828624725342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2210016250610352, + "epoch": 0.73, + "learning_rate": 3.6306001690617076e-05, + "loss": 1.0817, + "step": 859, + "task_loss": 1.6341801881790161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1012241840362549, + "epoch": 0.73, + "learning_rate": 3.6348267117497885e-05, + "loss": 0.7995, + "step": 860, + "task_loss": 0.48184433579444885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4935442209243774, + "epoch": 0.73, + "learning_rate": 3.63905325443787e-05, + "loss": 1.2728, + "step": 861, + "task_loss": 0.46124008297920227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7171981930732727, + "epoch": 0.73, + "learning_rate": 3.643279797125951e-05, + "loss": 1.016, + "step": 862, + "task_loss": 1.1392186880111694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4614613056182861, + "epoch": 0.73, + "learning_rate": 3.647506339814033e-05, + "loss": 1.0917, + "step": 863, + "task_loss": 1.782379150390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0746396780014038, + "epoch": 0.73, + "learning_rate": 3.6517328825021136e-05, + "loss": 0.9524, + "step": 864, + "task_loss": 0.4085269570350647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9016843438148499, + "epoch": 0.73, + "learning_rate": 3.6559594251901945e-05, + "loss": 0.9237, + "step": 865, + "task_loss": 1.166468620300293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4837757349014282, + "epoch": 0.73, + "learning_rate": 3.6601859678782755e-05, + "loss": 0.769, + "step": 866, + "task_loss": 0.3351168632507324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8171372413635254, + "epoch": 0.73, + "learning_rate": 3.664412510566357e-05, + "loss": 0.8371, + "step": 867, + "task_loss": 1.5543311834335327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1961016654968262, + "epoch": 0.73, + "learning_rate": 3.668639053254438e-05, + "loss": 1.2246, + "step": 868, + "task_loss": 0.3919358253479004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7490168809890747, + "epoch": 0.73, + "learning_rate": 3.672865595942519e-05, + "loss": 1.3098, + "step": 869, + "task_loss": 0.8026441931724548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9559720754623413, + "epoch": 0.73, + "learning_rate": 3.6770921386306e-05, + "loss": 1.0877, + "step": 870, + "task_loss": 0.7977285385131836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8871562480926514, + "epoch": 0.74, + "learning_rate": 3.6813186813186815e-05, + "loss": 0.9033, + "step": 871, + "task_loss": 1.9800527095794678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6465833187103271, + "epoch": 0.74, + "learning_rate": 3.685545224006763e-05, + "loss": 1.0784, + "step": 872, + "task_loss": 1.458868145942688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9516457319259644, + "epoch": 0.74, + "learning_rate": 3.689771766694844e-05, + "loss": 0.9062, + "step": 873, + "task_loss": 0.631178617477417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7906115651130676, + "epoch": 0.74, + "learning_rate": 3.693998309382925e-05, + "loss": 0.9737, + "step": 874, + "task_loss": 0.678638756275177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.786811351776123, + "epoch": 0.74, + "learning_rate": 3.698224852071006e-05, + "loss": 0.8342, + "step": 875, + "task_loss": 0.5169845819473267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9709447622299194, + "epoch": 0.74, + "learning_rate": 3.702451394759087e-05, + "loss": 0.9206, + "step": 876, + "task_loss": 0.7110479474067688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1406593322753906, + "epoch": 0.74, + "learning_rate": 3.7066779374471685e-05, + "loss": 1.0922, + "step": 877, + "task_loss": 1.4186698198318481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0560839176177979, + "epoch": 0.74, + "learning_rate": 3.7109044801352495e-05, + "loss": 0.995, + "step": 878, + "task_loss": 0.876980185508728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1954360008239746, + "epoch": 0.74, + "learning_rate": 3.7151310228233304e-05, + "loss": 1.2067, + "step": 879, + "task_loss": 0.7943779230117798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3926739692687988, + "epoch": 0.74, + "learning_rate": 3.7193575655114113e-05, + "loss": 1.2232, + "step": 880, + "task_loss": 1.330909013748169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8508328795433044, + "epoch": 0.74, + "learning_rate": 3.723584108199493e-05, + "loss": 0.8478, + "step": 881, + "task_loss": 0.2635786831378937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.735930323600769, + "epoch": 0.75, + "learning_rate": 3.7278106508875746e-05, + "loss": 1.1152, + "step": 882, + "task_loss": 0.3356916010379791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.267694354057312, + "epoch": 0.75, + "learning_rate": 3.7320371935756555e-05, + "loss": 1.004, + "step": 883, + "task_loss": 1.1062166690826416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9897673726081848, + "epoch": 0.75, + "learning_rate": 3.7362637362637365e-05, + "loss": 1.0358, + "step": 884, + "task_loss": 0.405316561460495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.065746784210205, + "epoch": 0.75, + "learning_rate": 3.7404902789518174e-05, + "loss": 0.8564, + "step": 885, + "task_loss": 1.871180534362793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9644671082496643, + "epoch": 0.75, + "learning_rate": 3.744716821639899e-05, + "loss": 0.8845, + "step": 886, + "task_loss": 0.7812583446502686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5859856009483337, + "epoch": 0.75, + "learning_rate": 3.74894336432798e-05, + "loss": 0.9276, + "step": 887, + "task_loss": 0.9026466012001038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0855333805084229, + "epoch": 0.75, + "learning_rate": 3.753169907016061e-05, + "loss": 0.8677, + "step": 888, + "task_loss": 0.5261902809143066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8959407806396484, + "epoch": 0.75, + "learning_rate": 3.757396449704142e-05, + "loss": 0.8746, + "step": 889, + "task_loss": 0.6883710026741028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2657511234283447, + "epoch": 0.75, + "learning_rate": 3.7616229923922234e-05, + "loss": 1.095, + "step": 890, + "task_loss": 0.5865076780319214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5554494857788086, + "epoch": 0.75, + "learning_rate": 3.7658495350803044e-05, + "loss": 1.1688, + "step": 891, + "task_loss": 2.1562516689300537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9893829822540283, + "epoch": 0.75, + "learning_rate": 3.770076077768386e-05, + "loss": 0.9013, + "step": 892, + "task_loss": 1.5681660175323486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6857393980026245, + "epoch": 0.75, + "learning_rate": 3.774302620456467e-05, + "loss": 1.0543, + "step": 893, + "task_loss": 0.49078652262687683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0735504627227783, + "epoch": 0.76, + "learning_rate": 3.778529163144548e-05, + "loss": 1.0778, + "step": 894, + "task_loss": 1.7317439317703247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.105076789855957, + "epoch": 0.76, + "learning_rate": 3.782755705832629e-05, + "loss": 0.9484, + "step": 895, + "task_loss": 0.2082778513431549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.031673789024353, + "epoch": 0.76, + "learning_rate": 3.7869822485207104e-05, + "loss": 1.1305, + "step": 896, + "task_loss": 0.7247858047485352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7948950529098511, + "epoch": 0.76, + "learning_rate": 3.7912087912087914e-05, + "loss": 1.1038, + "step": 897, + "task_loss": 1.256655216217041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.878950834274292, + "epoch": 0.76, + "learning_rate": 3.795435333896872e-05, + "loss": 0.8788, + "step": 898, + "task_loss": 0.3764266073703766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7968717217445374, + "epoch": 0.76, + "learning_rate": 3.799661876584953e-05, + "loss": 0.9649, + "step": 899, + "task_loss": 1.6645703315734863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8570467233657837, + "epoch": 0.76, + "learning_rate": 3.803888419273035e-05, + "loss": 0.8179, + "step": 900, + "task_loss": 0.5114364624023438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9215599298477173, + "epoch": 0.76, + "learning_rate": 3.8081149619611165e-05, + "loss": 0.9906, + "step": 901, + "task_loss": 1.2084954977035522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0516382455825806, + "epoch": 0.76, + "learning_rate": 3.8123415046491974e-05, + "loss": 0.8247, + "step": 902, + "task_loss": 0.9142789244651794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.994260847568512, + "epoch": 0.76, + "learning_rate": 3.8165680473372784e-05, + "loss": 0.9231, + "step": 903, + "task_loss": 1.0206170082092285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0775905847549438, + "epoch": 0.76, + "learning_rate": 3.820794590025359e-05, + "loss": 1.1574, + "step": 904, + "task_loss": 1.0544328689575195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3944138288497925, + "epoch": 0.76, + "learning_rate": 3.82502113271344e-05, + "loss": 0.9063, + "step": 905, + "task_loss": 1.109188437461853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9764843583106995, + "epoch": 0.77, + "learning_rate": 3.829247675401522e-05, + "loss": 0.9735, + "step": 906, + "task_loss": 1.8123575448989868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.017120122909546, + "epoch": 0.77, + "learning_rate": 3.833474218089603e-05, + "loss": 1.1359, + "step": 907, + "task_loss": 0.9205818176269531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0457156896591187, + "epoch": 0.77, + "learning_rate": 3.837700760777684e-05, + "loss": 0.8679, + "step": 908, + "task_loss": 2.0430691242218018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8560712337493896, + "epoch": 0.77, + "learning_rate": 3.8419273034657653e-05, + "loss": 1.013, + "step": 909, + "task_loss": 0.6274436712265015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9065303206443787, + "epoch": 0.77, + "learning_rate": 3.846153846153846e-05, + "loss": 1.0423, + "step": 910, + "task_loss": 0.7250082492828369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.161895751953125, + "epoch": 0.77, + "learning_rate": 3.850380388841928e-05, + "loss": 1.1945, + "step": 911, + "task_loss": 1.1996887922286987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2840684652328491, + "epoch": 0.77, + "learning_rate": 3.854606931530009e-05, + "loss": 1.1767, + "step": 912, + "task_loss": 0.7904782295227051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3081060647964478, + "epoch": 0.77, + "learning_rate": 3.85883347421809e-05, + "loss": 1.0609, + "step": 913, + "task_loss": 1.5624157190322876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.021205186843872, + "epoch": 0.77, + "learning_rate": 3.863060016906171e-05, + "loss": 1.0364, + "step": 914, + "task_loss": 1.558895468711853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0080324411392212, + "epoch": 0.77, + "learning_rate": 3.867286559594252e-05, + "loss": 0.7807, + "step": 915, + "task_loss": 1.084423542022705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6091734170913696, + "epoch": 0.77, + "learning_rate": 3.871513102282333e-05, + "loss": 0.8504, + "step": 916, + "task_loss": 0.7098196744918823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1158721446990967, + "epoch": 0.77, + "learning_rate": 3.875739644970414e-05, + "loss": 0.874, + "step": 917, + "task_loss": 1.6732234954833984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.121085286140442, + "epoch": 0.78, + "learning_rate": 3.879966187658495e-05, + "loss": 1.0027, + "step": 918, + "task_loss": 1.22336745262146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5889465808868408, + "epoch": 0.78, + "learning_rate": 3.884192730346577e-05, + "loss": 0.8872, + "step": 919, + "task_loss": 0.39150503277778625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6394715309143066, + "epoch": 0.78, + "learning_rate": 3.888419273034658e-05, + "loss": 1.1139, + "step": 920, + "task_loss": 1.017325758934021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9628031253814697, + "epoch": 0.78, + "learning_rate": 3.892645815722739e-05, + "loss": 1.0536, + "step": 921, + "task_loss": 1.054599642753601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6038956046104431, + "epoch": 0.78, + "learning_rate": 3.89687235841082e-05, + "loss": 0.8938, + "step": 922, + "task_loss": 0.16306006908416748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.010401725769043, + "epoch": 0.78, + "learning_rate": 3.901098901098901e-05, + "loss": 0.8673, + "step": 923, + "task_loss": 1.7269126176834106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8716728687286377, + "epoch": 0.78, + "learning_rate": 3.905325443786982e-05, + "loss": 1.1106, + "step": 924, + "task_loss": 1.7616655826568604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9565305709838867, + "epoch": 0.78, + "learning_rate": 3.909551986475064e-05, + "loss": 1.3859, + "step": 925, + "task_loss": 1.3713593482971191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5824107527732849, + "epoch": 0.78, + "learning_rate": 3.913778529163145e-05, + "loss": 0.724, + "step": 926, + "task_loss": 0.6481862664222717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4840768575668335, + "epoch": 0.78, + "learning_rate": 3.9180050718512256e-05, + "loss": 0.8968, + "step": 927, + "task_loss": 0.5967175960540771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0531705617904663, + "epoch": 0.78, + "learning_rate": 3.9222316145393066e-05, + "loss": 1.0526, + "step": 928, + "task_loss": 0.5348222255706787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0892040729522705, + "epoch": 0.78, + "learning_rate": 3.926458157227388e-05, + "loss": 0.9979, + "step": 929, + "task_loss": 0.9812852740287781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6578410863876343, + "epoch": 0.79, + "learning_rate": 3.93068469991547e-05, + "loss": 0.983, + "step": 930, + "task_loss": 1.4472873210906982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7527291774749756, + "epoch": 0.79, + "learning_rate": 3.934911242603551e-05, + "loss": 0.8333, + "step": 931, + "task_loss": 0.7980977892875671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47932684421539307, + "epoch": 0.79, + "learning_rate": 3.939137785291632e-05, + "loss": 0.6831, + "step": 932, + "task_loss": 0.29740583896636963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3974848985671997, + "epoch": 0.79, + "learning_rate": 3.9433643279797126e-05, + "loss": 1.1262, + "step": 933, + "task_loss": 1.995893955230713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8135479688644409, + "epoch": 0.79, + "learning_rate": 3.9475908706677936e-05, + "loss": 0.8664, + "step": 934, + "task_loss": 1.0040594339370728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2828150987625122, + "epoch": 0.79, + "learning_rate": 3.951817413355875e-05, + "loss": 1.0743, + "step": 935, + "task_loss": 0.8203858137130737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7958099842071533, + "epoch": 0.79, + "learning_rate": 3.956043956043956e-05, + "loss": 0.739, + "step": 936, + "task_loss": 0.23765406012535095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.710662841796875, + "epoch": 0.79, + "learning_rate": 3.960270498732037e-05, + "loss": 0.9676, + "step": 937, + "task_loss": 0.5433305501937866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8049070835113525, + "epoch": 0.79, + "learning_rate": 3.964497041420119e-05, + "loss": 0.8704, + "step": 938, + "task_loss": 0.8373986482620239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.930149495601654, + "epoch": 0.79, + "learning_rate": 3.9687235841081996e-05, + "loss": 0.8924, + "step": 939, + "task_loss": 0.4963827431201935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9112241864204407, + "epoch": 0.79, + "learning_rate": 3.972950126796281e-05, + "loss": 1.0633, + "step": 940, + "task_loss": 1.2500132322311401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5710884928703308, + "epoch": 0.79, + "learning_rate": 3.977176669484362e-05, + "loss": 0.8388, + "step": 941, + "task_loss": 0.8553664088249207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7972924113273621, + "epoch": 0.8, + "learning_rate": 3.981403212172443e-05, + "loss": 0.87, + "step": 942, + "task_loss": 0.7139917612075806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9264711141586304, + "epoch": 0.8, + "learning_rate": 3.985629754860524e-05, + "loss": 1.009, + "step": 943, + "task_loss": 0.2500097453594208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2713966369628906, + "epoch": 0.8, + "learning_rate": 3.989856297548606e-05, + "loss": 0.9165, + "step": 944, + "task_loss": 0.8897305727005005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.965168297290802, + "epoch": 0.8, + "learning_rate": 3.9940828402366866e-05, + "loss": 1.0089, + "step": 945, + "task_loss": 0.8689515590667725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0661712884902954, + "epoch": 0.8, + "learning_rate": 3.9983093829247675e-05, + "loss": 0.7454, + "step": 946, + "task_loss": 0.8283068537712097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7944927215576172, + "epoch": 0.8, + "learning_rate": 4.0025359256128485e-05, + "loss": 0.8275, + "step": 947, + "task_loss": 0.8048862218856812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5694467425346375, + "epoch": 0.8, + "learning_rate": 4.00676246830093e-05, + "loss": 0.8465, + "step": 948, + "task_loss": 1.6677870750427246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7159451842308044, + "epoch": 0.8, + "learning_rate": 4.010989010989011e-05, + "loss": 0.9979, + "step": 949, + "task_loss": 0.721280574798584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1846003532409668, + "epoch": 0.8, + "learning_rate": 4.0152155536770927e-05, + "loss": 0.9007, + "step": 950, + "task_loss": 0.31311145424842834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7975897192955017, + "epoch": 0.8, + "learning_rate": 4.0194420963651736e-05, + "loss": 0.7006, + "step": 951, + "task_loss": 0.6168971657752991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1540274620056152, + "epoch": 0.8, + "learning_rate": 4.0236686390532545e-05, + "loss": 1.0858, + "step": 952, + "task_loss": 1.2207467555999756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5513806343078613, + "epoch": 0.81, + "learning_rate": 4.0278951817413355e-05, + "loss": 0.6808, + "step": 953, + "task_loss": 0.6391775012016296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8955473899841309, + "epoch": 0.81, + "learning_rate": 4.032121724429417e-05, + "loss": 0.9434, + "step": 954, + "task_loss": 1.462203025817871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7297910451889038, + "epoch": 0.81, + "learning_rate": 4.036348267117498e-05, + "loss": 0.8184, + "step": 955, + "task_loss": 0.5925722122192383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8842399716377258, + "epoch": 0.81, + "learning_rate": 4.040574809805579e-05, + "loss": 0.9015, + "step": 956, + "task_loss": 0.2413739711046219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1322941780090332, + "epoch": 0.81, + "learning_rate": 4.0448013524936606e-05, + "loss": 0.9982, + "step": 957, + "task_loss": 1.266666054725647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9506944417953491, + "epoch": 0.81, + "learning_rate": 4.0490278951817415e-05, + "loss": 0.8833, + "step": 958, + "task_loss": 1.259725570678711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7270945906639099, + "epoch": 0.81, + "learning_rate": 4.053254437869823e-05, + "loss": 0.8987, + "step": 959, + "task_loss": 0.7694357633590698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8349928855895996, + "epoch": 0.81, + "learning_rate": 4.057480980557904e-05, + "loss": 1.0189, + "step": 960, + "task_loss": 0.9906123876571655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.977832019329071, + "epoch": 0.81, + "learning_rate": 4.061707523245985e-05, + "loss": 0.8727, + "step": 961, + "task_loss": 0.6111451387405396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3641244173049927, + "epoch": 0.81, + "learning_rate": 4.065934065934066e-05, + "loss": 1.0169, + "step": 962, + "task_loss": 1.0062824487686157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7471967935562134, + "epoch": 0.81, + "learning_rate": 4.070160608622147e-05, + "loss": 1.068, + "step": 963, + "task_loss": 1.2559819221496582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9855952262878418, + "epoch": 0.81, + "learning_rate": 4.0743871513102285e-05, + "loss": 0.9983, + "step": 964, + "task_loss": 0.24628500640392303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4411020278930664, + "epoch": 0.82, + "learning_rate": 4.0786136939983095e-05, + "loss": 0.9164, + "step": 965, + "task_loss": 1.339667797088623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.022794485092163, + "epoch": 0.82, + "learning_rate": 4.0828402366863904e-05, + "loss": 0.823, + "step": 966, + "task_loss": 1.9382131099700928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9341446757316589, + "epoch": 0.82, + "learning_rate": 4.087066779374472e-05, + "loss": 1.0929, + "step": 967, + "task_loss": 1.0955976247787476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8919326066970825, + "epoch": 0.82, + "learning_rate": 4.091293322062553e-05, + "loss": 0.996, + "step": 968, + "task_loss": 0.5159933567047119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0778310298919678, + "epoch": 0.82, + "learning_rate": 4.0955198647506346e-05, + "loss": 0.9628, + "step": 969, + "task_loss": 1.631089210510254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.646014928817749, + "epoch": 0.82, + "learning_rate": 4.0997464074387155e-05, + "loss": 0.7177, + "step": 970, + "task_loss": 0.848716139793396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.737389087677002, + "epoch": 0.82, + "learning_rate": 4.1039729501267964e-05, + "loss": 1.2569, + "step": 971, + "task_loss": 1.0369161367416382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7804964184761047, + "epoch": 0.82, + "learning_rate": 4.1081994928148774e-05, + "loss": 0.7256, + "step": 972, + "task_loss": 0.5092301964759827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.158642053604126, + "epoch": 0.82, + "learning_rate": 4.112426035502959e-05, + "loss": 0.9623, + "step": 973, + "task_loss": 0.7963719367980957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.00654137134552, + "epoch": 0.82, + "learning_rate": 4.11665257819104e-05, + "loss": 0.8913, + "step": 974, + "task_loss": 0.6699679493904114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6577127575874329, + "epoch": 0.82, + "learning_rate": 4.120879120879121e-05, + "loss": 0.7402, + "step": 975, + "task_loss": 0.4661967158317566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9003466367721558, + "epoch": 0.82, + "learning_rate": 4.125105663567202e-05, + "loss": 0.9758, + "step": 976, + "task_loss": 0.6604939699172974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0959017276763916, + "epoch": 0.83, + "learning_rate": 4.1293322062552834e-05, + "loss": 0.9385, + "step": 977, + "task_loss": 1.63467538356781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8077751398086548, + "epoch": 0.83, + "learning_rate": 4.1335587489433644e-05, + "loss": 0.6735, + "step": 978, + "task_loss": 0.8219801187515259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8586176633834839, + "epoch": 0.83, + "learning_rate": 4.137785291631446e-05, + "loss": 0.8983, + "step": 979, + "task_loss": 0.4462464153766632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9896467924118042, + "epoch": 0.83, + "learning_rate": 4.142011834319527e-05, + "loss": 0.954, + "step": 980, + "task_loss": 1.3744866847991943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0606168508529663, + "epoch": 0.83, + "learning_rate": 4.146238377007608e-05, + "loss": 1.1461, + "step": 981, + "task_loss": 1.6571584939956665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8513714075088501, + "epoch": 0.83, + "learning_rate": 4.150464919695689e-05, + "loss": 0.8889, + "step": 982, + "task_loss": 0.3037635385990143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.994424045085907, + "epoch": 0.83, + "learning_rate": 4.1546914623837704e-05, + "loss": 0.9041, + "step": 983, + "task_loss": 1.0255382061004639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6671987771987915, + "epoch": 0.83, + "learning_rate": 4.1589180050718514e-05, + "loss": 1.1117, + "step": 984, + "task_loss": 1.1333987712860107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7842681407928467, + "epoch": 0.83, + "learning_rate": 4.163144547759932e-05, + "loss": 0.7954, + "step": 985, + "task_loss": 1.2768142223358154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8951166272163391, + "epoch": 0.83, + "learning_rate": 4.167371090448014e-05, + "loss": 0.9686, + "step": 986, + "task_loss": 1.4309160709381104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2169384956359863, + "epoch": 0.83, + "learning_rate": 4.171597633136095e-05, + "loss": 1.0158, + "step": 987, + "task_loss": 1.3494470119476318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4003479480743408, + "epoch": 0.83, + "learning_rate": 4.1758241758241765e-05, + "loss": 1.125, + "step": 988, + "task_loss": 1.1926672458648682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8295202255249023, + "epoch": 0.84, + "learning_rate": 4.1800507185122574e-05, + "loss": 0.7569, + "step": 989, + "task_loss": 1.34328293800354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1668071746826172, + "epoch": 0.84, + "learning_rate": 4.1842772612003383e-05, + "loss": 0.9298, + "step": 990, + "task_loss": 1.9495867490768433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5928083658218384, + "epoch": 0.84, + "learning_rate": 4.188503803888419e-05, + "loss": 0.9434, + "step": 991, + "task_loss": 1.0725996494293213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5244714021682739, + "epoch": 0.84, + "learning_rate": 4.1927303465765e-05, + "loss": 0.632, + "step": 992, + "task_loss": 1.2524352073669434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2880223989486694, + "epoch": 0.84, + "learning_rate": 4.196956889264582e-05, + "loss": 0.9899, + "step": 993, + "task_loss": 0.8505713939666748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0303956270217896, + "epoch": 0.84, + "learning_rate": 4.201183431952663e-05, + "loss": 0.9589, + "step": 994, + "task_loss": 1.6729624271392822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8796193599700928, + "epoch": 0.84, + "learning_rate": 4.205409974640744e-05, + "loss": 0.8152, + "step": 995, + "task_loss": 0.8178637623786926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.541543185710907, + "epoch": 0.84, + "learning_rate": 4.209636517328825e-05, + "loss": 0.7942, + "step": 996, + "task_loss": 0.6259867548942566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8939079642295837, + "epoch": 0.84, + "learning_rate": 4.213863060016906e-05, + "loss": 0.9016, + "step": 997, + "task_loss": 1.4589275121688843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.916896641254425, + "epoch": 0.84, + "learning_rate": 4.218089602704988e-05, + "loss": 0.8484, + "step": 998, + "task_loss": 1.7455095052719116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2857661247253418, + "epoch": 0.84, + "learning_rate": 4.222316145393069e-05, + "loss": 0.9986, + "step": 999, + "task_loss": 1.3231134414672852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5655268430709839, + "epoch": 0.84, + "learning_rate": 4.22654268808115e-05, + "loss": 0.6823, + "step": 1000, + "task_loss": 0.7202645540237427 + }, + { + "epoch": 0.84, + "eval_accuracy": 0.8818217821782178, + "eval_loss": 0.5126578211784363, + "eval_runtime": 229.1486, + "eval_samples_per_second": 110.191, + "eval_steps_per_second": 0.864, + "step": 1000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.647757887840271, + "epoch": 0.85, + "learning_rate": 4.230769230769231e-05, + "loss": 0.7882, + "step": 1001, + "task_loss": 0.5003728270530701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.466987669467926, + "epoch": 0.85, + "learning_rate": 4.234995773457312e-05, + "loss": 0.7159, + "step": 1002, + "task_loss": 0.5549584031105042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8620222806930542, + "epoch": 0.85, + "learning_rate": 4.239222316145393e-05, + "loss": 0.8545, + "step": 1003, + "task_loss": 0.8501796722412109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9268772602081299, + "epoch": 0.85, + "learning_rate": 4.243448858833474e-05, + "loss": 0.8878, + "step": 1004, + "task_loss": 0.720402717590332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.7705105543136597, + "epoch": 0.85, + "learning_rate": 4.247675401521555e-05, + "loss": 1.0673, + "step": 1005, + "task_loss": 2.286893606185913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1945281028747559, + "epoch": 0.85, + "learning_rate": 4.251901944209637e-05, + "loss": 0.882, + "step": 1006, + "task_loss": 0.22540704905986786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8362153768539429, + "epoch": 0.85, + "learning_rate": 4.256128486897718e-05, + "loss": 0.7769, + "step": 1007, + "task_loss": 0.527235746383667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8700671195983887, + "epoch": 0.85, + "learning_rate": 4.260355029585799e-05, + "loss": 0.8316, + "step": 1008, + "task_loss": 0.9892043471336365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5354052782058716, + "epoch": 0.85, + "learning_rate": 4.26458157227388e-05, + "loss": 1.0411, + "step": 1009, + "task_loss": 1.6685892343521118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7675468921661377, + "epoch": 0.85, + "learning_rate": 4.268808114961961e-05, + "loss": 0.9394, + "step": 1010, + "task_loss": 1.3933601379394531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1413772106170654, + "epoch": 0.85, + "learning_rate": 4.273034657650042e-05, + "loss": 1.0169, + "step": 1011, + "task_loss": 1.3889970779418945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7557093501091003, + "epoch": 0.85, + "learning_rate": 4.277261200338124e-05, + "loss": 0.8941, + "step": 1012, + "task_loss": 0.30902594327926636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5545022487640381, + "epoch": 0.86, + "learning_rate": 4.281487743026205e-05, + "loss": 0.6347, + "step": 1013, + "task_loss": 0.36918920278549194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9747594594955444, + "epoch": 0.86, + "learning_rate": 4.2857142857142856e-05, + "loss": 0.906, + "step": 1014, + "task_loss": 0.7703030109405518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6983894109725952, + "epoch": 0.86, + "learning_rate": 4.289940828402367e-05, + "loss": 0.7093, + "step": 1015, + "task_loss": 0.9726892113685608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6856435537338257, + "epoch": 0.86, + "learning_rate": 4.294167371090448e-05, + "loss": 0.794, + "step": 1016, + "task_loss": 0.7919455170631409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6856673955917358, + "epoch": 0.86, + "learning_rate": 4.29839391377853e-05, + "loss": 0.9054, + "step": 1017, + "task_loss": 0.4258291721343994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8028672933578491, + "epoch": 0.86, + "learning_rate": 4.302620456466611e-05, + "loss": 0.7592, + "step": 1018, + "task_loss": 1.0734472274780273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.469557523727417, + "epoch": 0.86, + "learning_rate": 4.306846999154692e-05, + "loss": 1.0664, + "step": 1019, + "task_loss": 1.4430866241455078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8346530795097351, + "epoch": 0.86, + "learning_rate": 4.3110735418427726e-05, + "loss": 0.6785, + "step": 1020, + "task_loss": 1.2720863819122314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6673634052276611, + "epoch": 0.86, + "learning_rate": 4.3153000845308536e-05, + "loss": 1.1122, + "step": 1021, + "task_loss": 0.1406627744436264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6353693604469299, + "epoch": 0.86, + "learning_rate": 4.319526627218935e-05, + "loss": 0.8348, + "step": 1022, + "task_loss": 0.12585875391960144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7626101970672607, + "epoch": 0.86, + "learning_rate": 4.323753169907016e-05, + "loss": 0.7084, + "step": 1023, + "task_loss": 1.0916800498962402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8243899345397949, + "epoch": 0.87, + "learning_rate": 4.327979712595097e-05, + "loss": 0.9452, + "step": 1024, + "task_loss": 0.2818711996078491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7899994850158691, + "epoch": 0.87, + "learning_rate": 4.332206255283179e-05, + "loss": 0.7363, + "step": 1025, + "task_loss": 0.7793794870376587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4777749180793762, + "epoch": 0.87, + "learning_rate": 4.3364327979712596e-05, + "loss": 0.8622, + "step": 1026, + "task_loss": 0.023271748796105385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7985219955444336, + "epoch": 0.87, + "learning_rate": 4.340659340659341e-05, + "loss": 0.9553, + "step": 1027, + "task_loss": 0.6418771147727966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9799319505691528, + "epoch": 0.87, + "learning_rate": 4.344885883347422e-05, + "loss": 1.0448, + "step": 1028, + "task_loss": 1.342651605606079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5869677662849426, + "epoch": 0.87, + "learning_rate": 4.349112426035503e-05, + "loss": 0.7644, + "step": 1029, + "task_loss": 0.8527691960334778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2063424587249756, + "epoch": 0.87, + "learning_rate": 4.353338968723584e-05, + "loss": 0.8745, + "step": 1030, + "task_loss": 1.6588696241378784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0741658210754395, + "epoch": 0.87, + "learning_rate": 4.3575655114116657e-05, + "loss": 0.9719, + "step": 1031, + "task_loss": 0.6360454559326172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4267752170562744, + "epoch": 0.87, + "learning_rate": 4.3617920540997466e-05, + "loss": 0.9083, + "step": 1032, + "task_loss": 0.5487968325614929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5794072151184082, + "epoch": 0.87, + "learning_rate": 4.3660185967878275e-05, + "loss": 1.0125, + "step": 1033, + "task_loss": 0.320943146944046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1888625621795654, + "epoch": 0.87, + "learning_rate": 4.370245139475909e-05, + "loss": 0.9984, + "step": 1034, + "task_loss": 1.572576642036438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5843337178230286, + "epoch": 0.87, + "learning_rate": 4.37447168216399e-05, + "loss": 0.737, + "step": 1035, + "task_loss": 1.1464805603027344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1899930238723755, + "epoch": 0.88, + "learning_rate": 4.378698224852072e-05, + "loss": 1.0306, + "step": 1036, + "task_loss": 0.8967394828796387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7702544927597046, + "epoch": 0.88, + "learning_rate": 4.3829247675401526e-05, + "loss": 0.8795, + "step": 1037, + "task_loss": 1.5231029987335205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7108954191207886, + "epoch": 0.88, + "learning_rate": 4.3871513102282336e-05, + "loss": 0.8794, + "step": 1038, + "task_loss": 0.6973483562469482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8070660829544067, + "epoch": 0.88, + "learning_rate": 4.3913778529163145e-05, + "loss": 0.8355, + "step": 1039, + "task_loss": 0.4528863728046417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.891578197479248, + "epoch": 0.88, + "learning_rate": 4.3956043956043955e-05, + "loss": 0.9294, + "step": 1040, + "task_loss": 0.7368393540382385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9262855052947998, + "epoch": 0.88, + "learning_rate": 4.399830938292477e-05, + "loss": 1.1193, + "step": 1041, + "task_loss": 0.9151571989059448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6654446125030518, + "epoch": 0.88, + "learning_rate": 4.404057480980558e-05, + "loss": 0.6845, + "step": 1042, + "task_loss": 1.1652885675430298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.917605459690094, + "epoch": 0.88, + "learning_rate": 4.408284023668639e-05, + "loss": 0.9026, + "step": 1043, + "task_loss": 0.699445903301239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6230236887931824, + "epoch": 0.88, + "learning_rate": 4.4125105663567206e-05, + "loss": 0.8428, + "step": 1044, + "task_loss": 0.5558046102523804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5473418235778809, + "epoch": 0.88, + "learning_rate": 4.4167371090448015e-05, + "loss": 0.935, + "step": 1045, + "task_loss": 0.723911464214325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5218133926391602, + "epoch": 0.88, + "learning_rate": 4.420963651732883e-05, + "loss": 1.1784, + "step": 1046, + "task_loss": 1.442440152168274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.523809015750885, + "epoch": 0.88, + "learning_rate": 4.425190194420964e-05, + "loss": 1.159, + "step": 1047, + "task_loss": 0.2729208171367645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5910937786102295, + "epoch": 0.89, + "learning_rate": 4.429416737109045e-05, + "loss": 0.8459, + "step": 1048, + "task_loss": 0.7357900738716125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6066716909408569, + "epoch": 0.89, + "learning_rate": 4.433643279797126e-05, + "loss": 0.8672, + "step": 1049, + "task_loss": 0.8049559593200684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7794140577316284, + "epoch": 0.89, + "learning_rate": 4.437869822485207e-05, + "loss": 0.9468, + "step": 1050, + "task_loss": 0.7368866205215454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5288619995117188, + "epoch": 0.89, + "learning_rate": 4.4420963651732885e-05, + "loss": 0.7537, + "step": 1051, + "task_loss": 0.6331936717033386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1077601909637451, + "epoch": 0.89, + "learning_rate": 4.4463229078613694e-05, + "loss": 0.96, + "step": 1052, + "task_loss": 0.5554484128952026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9439190626144409, + "epoch": 0.89, + "learning_rate": 4.4505494505494504e-05, + "loss": 0.7444, + "step": 1053, + "task_loss": 1.0459246635437012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8643949031829834, + "epoch": 0.89, + "learning_rate": 4.454775993237532e-05, + "loss": 0.9363, + "step": 1054, + "task_loss": 1.070670485496521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43990471959114075, + "epoch": 0.89, + "learning_rate": 4.459002535925613e-05, + "loss": 0.7478, + "step": 1055, + "task_loss": 0.27512404322624207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2374026775360107, + "epoch": 0.89, + "learning_rate": 4.4632290786136946e-05, + "loss": 0.9227, + "step": 1056, + "task_loss": 1.3580701351165771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 2.390275716781616, + "epoch": 0.89, + "learning_rate": 4.4674556213017755e-05, + "loss": 1.0513, + "step": 1057, + "task_loss": 1.2459529638290405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0342093706130981, + "epoch": 0.89, + "learning_rate": 4.4716821639898564e-05, + "loss": 0.8001, + "step": 1058, + "task_loss": 1.5110549926757812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4645204246044159, + "epoch": 0.89, + "learning_rate": 4.4759087066779374e-05, + "loss": 0.7747, + "step": 1059, + "task_loss": 0.2645789384841919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2553523778915405, + "epoch": 0.9, + "learning_rate": 4.480135249366019e-05, + "loss": 0.9667, + "step": 1060, + "task_loss": 1.6067612171173096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5791783928871155, + "epoch": 0.9, + "learning_rate": 4.4843617920541e-05, + "loss": 0.8263, + "step": 1061, + "task_loss": 0.5787603259086609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1135956048965454, + "epoch": 0.9, + "learning_rate": 4.488588334742181e-05, + "loss": 0.9713, + "step": 1062, + "task_loss": 1.0425268411636353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0000865459442139, + "epoch": 0.9, + "learning_rate": 4.4928148774302625e-05, + "loss": 0.822, + "step": 1063, + "task_loss": 0.6935333013534546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.6616714000701904, + "epoch": 0.9, + "learning_rate": 4.4970414201183434e-05, + "loss": 1.0247, + "step": 1064, + "task_loss": 1.922581434249878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9025981426239014, + "epoch": 0.9, + "learning_rate": 4.501267962806425e-05, + "loss": 0.902, + "step": 1065, + "task_loss": 0.7872021198272705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1003330945968628, + "epoch": 0.9, + "learning_rate": 4.505494505494506e-05, + "loss": 1.0747, + "step": 1066, + "task_loss": 0.3769143223762512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5716548562049866, + "epoch": 0.9, + "learning_rate": 4.509721048182587e-05, + "loss": 0.8985, + "step": 1067, + "task_loss": 0.5844125151634216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0084329843521118, + "epoch": 0.9, + "learning_rate": 4.513947590870668e-05, + "loss": 0.7896, + "step": 1068, + "task_loss": 0.9215744733810425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4554329812526703, + "epoch": 0.9, + "learning_rate": 4.518174133558749e-05, + "loss": 0.6577, + "step": 1069, + "task_loss": 0.7598364353179932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9826257824897766, + "epoch": 0.9, + "learning_rate": 4.5224006762468304e-05, + "loss": 0.8512, + "step": 1070, + "task_loss": 0.9104565382003784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8501220345497131, + "epoch": 0.9, + "learning_rate": 4.5266272189349114e-05, + "loss": 0.8118, + "step": 1071, + "task_loss": 0.7907831072807312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.846510112285614, + "epoch": 0.91, + "learning_rate": 4.530853761622992e-05, + "loss": 0.9324, + "step": 1072, + "task_loss": 1.0277478694915771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7719987630844116, + "epoch": 0.91, + "learning_rate": 4.535080304311074e-05, + "loss": 0.912, + "step": 1073, + "task_loss": 0.6699872612953186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2992414236068726, + "epoch": 0.91, + "learning_rate": 4.539306846999155e-05, + "loss": 0.7656, + "step": 1074, + "task_loss": 0.9048945307731628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8488128781318665, + "epoch": 0.91, + "learning_rate": 4.5435333896872365e-05, + "loss": 0.9368, + "step": 1075, + "task_loss": 0.5284009575843811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7792898416519165, + "epoch": 0.91, + "learning_rate": 4.5477599323753174e-05, + "loss": 1.1859, + "step": 1076, + "task_loss": 0.6506417989730835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8472839593887329, + "epoch": 0.91, + "learning_rate": 4.5519864750633983e-05, + "loss": 0.9629, + "step": 1077, + "task_loss": 1.2173994779586792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46597281098365784, + "epoch": 0.91, + "learning_rate": 4.556213017751479e-05, + "loss": 0.6591, + "step": 1078, + "task_loss": 0.7137371897697449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6434260010719299, + "epoch": 0.91, + "learning_rate": 4.56043956043956e-05, + "loss": 0.853, + "step": 1079, + "task_loss": 0.5525861978530884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7839975953102112, + "epoch": 0.91, + "learning_rate": 4.564666103127642e-05, + "loss": 0.998, + "step": 1080, + "task_loss": 0.9719991683959961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0682356357574463, + "epoch": 0.91, + "learning_rate": 4.568892645815723e-05, + "loss": 1.059, + "step": 1081, + "task_loss": 1.2033246755599976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0434798002243042, + "epoch": 0.91, + "learning_rate": 4.573119188503804e-05, + "loss": 0.9648, + "step": 1082, + "task_loss": 1.0331995487213135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5535196661949158, + "epoch": 0.91, + "learning_rate": 4.577345731191885e-05, + "loss": 0.8531, + "step": 1083, + "task_loss": 0.4642697870731354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7953972220420837, + "epoch": 0.92, + "learning_rate": 4.581572273879966e-05, + "loss": 1.0091, + "step": 1084, + "task_loss": 0.6886611580848694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6668606400489807, + "epoch": 0.92, + "learning_rate": 4.585798816568048e-05, + "loss": 0.9639, + "step": 1085, + "task_loss": 0.9323781728744507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6567552089691162, + "epoch": 0.92, + "learning_rate": 4.590025359256129e-05, + "loss": 0.6679, + "step": 1086, + "task_loss": 1.1746405363082886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3561595678329468, + "epoch": 0.92, + "learning_rate": 4.59425190194421e-05, + "loss": 0.9204, + "step": 1087, + "task_loss": 1.1490095853805542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9540305137634277, + "epoch": 0.92, + "learning_rate": 4.598478444632291e-05, + "loss": 0.9032, + "step": 1088, + "task_loss": 0.7797126173973083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8380683660507202, + "epoch": 0.92, + "learning_rate": 4.602704987320372e-05, + "loss": 0.7912, + "step": 1089, + "task_loss": 0.8965733647346497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.162031650543213, + "epoch": 0.92, + "learning_rate": 4.606931530008453e-05, + "loss": 0.7823, + "step": 1090, + "task_loss": 1.0961861610412598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6758277416229248, + "epoch": 0.92, + "learning_rate": 4.611158072696534e-05, + "loss": 0.9393, + "step": 1091, + "task_loss": 1.8535586595535278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8472609519958496, + "epoch": 0.92, + "learning_rate": 4.615384615384616e-05, + "loss": 0.7982, + "step": 1092, + "task_loss": 0.9317784309387207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3202778100967407, + "epoch": 0.92, + "learning_rate": 4.619611158072697e-05, + "loss": 0.9378, + "step": 1093, + "task_loss": 1.2962573766708374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0958589315414429, + "epoch": 0.92, + "learning_rate": 4.6238377007607784e-05, + "loss": 0.8867, + "step": 1094, + "task_loss": 0.39049527049064636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.743079662322998, + "epoch": 0.93, + "learning_rate": 4.628064243448859e-05, + "loss": 0.6614, + "step": 1095, + "task_loss": 0.8939529657363892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6313505172729492, + "epoch": 0.93, + "learning_rate": 4.63229078613694e-05, + "loss": 0.8634, + "step": 1096, + "task_loss": 0.21062703430652618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1305420398712158, + "epoch": 0.93, + "learning_rate": 4.636517328825021e-05, + "loss": 0.9591, + "step": 1097, + "task_loss": 1.2483044862747192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.773510217666626, + "epoch": 0.93, + "learning_rate": 4.640743871513102e-05, + "loss": 0.6836, + "step": 1098, + "task_loss": 0.8179703950881958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9545236825942993, + "epoch": 0.93, + "learning_rate": 4.644970414201184e-05, + "loss": 0.8491, + "step": 1099, + "task_loss": 1.3345659971237183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9103188514709473, + "epoch": 0.93, + "learning_rate": 4.649196956889265e-05, + "loss": 0.9469, + "step": 1100, + "task_loss": 0.6113436222076416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.858731746673584, + "epoch": 0.93, + "learning_rate": 4.6534234995773456e-05, + "loss": 0.9865, + "step": 1101, + "task_loss": 0.45781344175338745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1944605112075806, + "epoch": 0.93, + "learning_rate": 4.657650042265427e-05, + "loss": 0.865, + "step": 1102, + "task_loss": 0.811022162437439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5870804786682129, + "epoch": 0.93, + "learning_rate": 4.661876584953508e-05, + "loss": 0.6043, + "step": 1103, + "task_loss": 0.07904958724975586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.266842007637024, + "epoch": 0.93, + "learning_rate": 4.66610312764159e-05, + "loss": 0.9609, + "step": 1104, + "task_loss": 1.3891403675079346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.146797776222229, + "epoch": 0.93, + "learning_rate": 4.670329670329671e-05, + "loss": 0.9449, + "step": 1105, + "task_loss": 1.8598222732543945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4028728008270264, + "epoch": 0.93, + "learning_rate": 4.674556213017752e-05, + "loss": 0.9866, + "step": 1106, + "task_loss": 1.2189751863479614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7495356798171997, + "epoch": 0.94, + "learning_rate": 4.6787827557058326e-05, + "loss": 0.9477, + "step": 1107, + "task_loss": 0.49210137128829956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3993257284164429, + "epoch": 0.94, + "learning_rate": 4.683009298393914e-05, + "loss": 0.9121, + "step": 1108, + "task_loss": 1.292222499847412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5882537961006165, + "epoch": 0.94, + "learning_rate": 4.687235841081995e-05, + "loss": 0.8171, + "step": 1109, + "task_loss": 0.4914180338382721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5512704253196716, + "epoch": 0.94, + "learning_rate": 4.691462383770076e-05, + "loss": 0.592, + "step": 1110, + "task_loss": 0.6743378043174744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.005277156829834, + "epoch": 0.94, + "learning_rate": 4.695688926458158e-05, + "loss": 0.9265, + "step": 1111, + "task_loss": 0.5963150858879089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7325995564460754, + "epoch": 0.94, + "learning_rate": 4.6999154691462387e-05, + "loss": 0.8939, + "step": 1112, + "task_loss": 1.3621599674224854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.973212480545044, + "epoch": 0.94, + "learning_rate": 4.7041420118343196e-05, + "loss": 0.9879, + "step": 1113, + "task_loss": 0.654776930809021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0871949195861816, + "epoch": 0.94, + "learning_rate": 4.708368554522401e-05, + "loss": 1.1087, + "step": 1114, + "task_loss": 0.9001259803771973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4848073720932007, + "epoch": 0.94, + "learning_rate": 4.712595097210482e-05, + "loss": 0.6826, + "step": 1115, + "task_loss": 0.11071043461561203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.666934072971344, + "epoch": 0.94, + "learning_rate": 4.716821639898563e-05, + "loss": 0.6637, + "step": 1116, + "task_loss": 0.8432983160018921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.822002112865448, + "epoch": 0.94, + "learning_rate": 4.721048182586644e-05, + "loss": 0.8006, + "step": 1117, + "task_loss": 0.7242121696472168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9545989036560059, + "epoch": 0.94, + "learning_rate": 4.7252747252747257e-05, + "loss": 0.715, + "step": 1118, + "task_loss": 0.7143362164497375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5905972719192505, + "epoch": 0.95, + "learning_rate": 4.7295012679628066e-05, + "loss": 0.7469, + "step": 1119, + "task_loss": 0.33989161252975464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7058432698249817, + "epoch": 0.95, + "learning_rate": 4.7337278106508875e-05, + "loss": 0.6912, + "step": 1120, + "task_loss": 0.36675822734832764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8441854119300842, + "epoch": 0.95, + "learning_rate": 4.737954353338969e-05, + "loss": 0.6749, + "step": 1121, + "task_loss": 0.8376095294952393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5611559152603149, + "epoch": 0.95, + "learning_rate": 4.74218089602705e-05, + "loss": 0.87, + "step": 1122, + "task_loss": 0.19328813254833221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9309321045875549, + "epoch": 0.95, + "learning_rate": 4.746407438715132e-05, + "loss": 0.6955, + "step": 1123, + "task_loss": 0.7284340858459473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8362783193588257, + "epoch": 0.95, + "learning_rate": 4.7506339814032126e-05, + "loss": 0.8261, + "step": 1124, + "task_loss": 0.6988627314567566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5206829309463501, + "epoch": 0.95, + "learning_rate": 4.7548605240912936e-05, + "loss": 0.6633, + "step": 1125, + "task_loss": 0.3773687779903412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7864575982093811, + "epoch": 0.95, + "learning_rate": 4.7590870667793745e-05, + "loss": 0.9133, + "step": 1126, + "task_loss": 1.050910234451294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.12627112865448, + "epoch": 0.95, + "learning_rate": 4.7633136094674555e-05, + "loss": 0.831, + "step": 1127, + "task_loss": 1.7597535848617554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6861032247543335, + "epoch": 0.95, + "learning_rate": 4.767540152155537e-05, + "loss": 0.9403, + "step": 1128, + "task_loss": 1.1008251905441284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9799793362617493, + "epoch": 0.95, + "learning_rate": 4.771766694843618e-05, + "loss": 0.7904, + "step": 1129, + "task_loss": 1.340074062347412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9191635847091675, + "epoch": 0.95, + "learning_rate": 4.775993237531699e-05, + "loss": 0.7899, + "step": 1130, + "task_loss": 1.689366340637207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3684549331665039, + "epoch": 0.96, + "learning_rate": 4.7802197802197806e-05, + "loss": 0.635, + "step": 1131, + "task_loss": 0.11720118671655655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6864452362060547, + "epoch": 0.96, + "learning_rate": 4.7844463229078615e-05, + "loss": 0.9507, + "step": 1132, + "task_loss": 0.3467939794063568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5363916754722595, + "epoch": 0.96, + "learning_rate": 4.788672865595943e-05, + "loss": 0.8919, + "step": 1133, + "task_loss": 1.0830916166305542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7095955610275269, + "epoch": 0.96, + "learning_rate": 4.792899408284024e-05, + "loss": 0.8856, + "step": 1134, + "task_loss": 0.47506776452064514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1227432489395142, + "epoch": 0.96, + "learning_rate": 4.797125950972105e-05, + "loss": 1.0267, + "step": 1135, + "task_loss": 1.6771633625030518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8548144102096558, + "epoch": 0.96, + "learning_rate": 4.801352493660186e-05, + "loss": 0.8635, + "step": 1136, + "task_loss": 1.148176670074463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6977930665016174, + "epoch": 0.96, + "learning_rate": 4.8055790363482676e-05, + "loss": 0.7531, + "step": 1137, + "task_loss": 0.8055868148803711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9715834259986877, + "epoch": 0.96, + "learning_rate": 4.8098055790363485e-05, + "loss": 0.7051, + "step": 1138, + "task_loss": 0.6085440516471863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5230078101158142, + "epoch": 0.96, + "learning_rate": 4.8140321217244294e-05, + "loss": 0.9686, + "step": 1139, + "task_loss": 1.2118300199508667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4384773969650269, + "epoch": 0.96, + "learning_rate": 4.818258664412511e-05, + "loss": 1.0679, + "step": 1140, + "task_loss": 0.5830682516098022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0432379245758057, + "epoch": 0.96, + "learning_rate": 4.822485207100592e-05, + "loss": 0.8904, + "step": 1141, + "task_loss": 1.2358118295669556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.752661943435669, + "epoch": 0.96, + "learning_rate": 4.826711749788673e-05, + "loss": 0.8425, + "step": 1142, + "task_loss": 1.4105759859085083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.66182541847229, + "epoch": 0.97, + "learning_rate": 4.8309382924767545e-05, + "loss": 0.8207, + "step": 1143, + "task_loss": 0.6930851340293884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8332805633544922, + "epoch": 0.97, + "learning_rate": 4.8351648351648355e-05, + "loss": 0.7004, + "step": 1144, + "task_loss": 0.6605567336082458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7746790647506714, + "epoch": 0.97, + "learning_rate": 4.8393913778529164e-05, + "loss": 0.652, + "step": 1145, + "task_loss": 0.5090996026992798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7387020587921143, + "epoch": 0.97, + "learning_rate": 4.8436179205409974e-05, + "loss": 0.9608, + "step": 1146, + "task_loss": 1.7854914665222168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4871561527252197, + "epoch": 0.97, + "learning_rate": 4.847844463229079e-05, + "loss": 0.7618, + "step": 1147, + "task_loss": 0.32913532853126526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7172813415527344, + "epoch": 0.97, + "learning_rate": 4.85207100591716e-05, + "loss": 0.874, + "step": 1148, + "task_loss": 0.5674693584442139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.047457218170166, + "epoch": 0.97, + "learning_rate": 4.856297548605241e-05, + "loss": 0.7684, + "step": 1149, + "task_loss": 1.448573112487793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1701298952102661, + "epoch": 0.97, + "learning_rate": 4.8605240912933225e-05, + "loss": 0.8693, + "step": 1150, + "task_loss": 1.59458327293396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.786941647529602, + "epoch": 0.97, + "learning_rate": 4.8647506339814034e-05, + "loss": 0.7571, + "step": 1151, + "task_loss": 1.1657963991165161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8480064868927002, + "epoch": 0.97, + "learning_rate": 4.868977176669485e-05, + "loss": 0.7054, + "step": 1152, + "task_loss": 0.4873350262641907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4961494207382202, + "epoch": 0.97, + "learning_rate": 4.873203719357566e-05, + "loss": 0.8035, + "step": 1153, + "task_loss": 0.6741071939468384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42167800664901733, + "epoch": 0.97, + "learning_rate": 4.877430262045647e-05, + "loss": 0.7644, + "step": 1154, + "task_loss": 0.5470255017280579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9479389190673828, + "epoch": 0.98, + "learning_rate": 4.881656804733728e-05, + "loss": 0.8436, + "step": 1155, + "task_loss": 1.6125448942184448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6049249172210693, + "epoch": 0.98, + "learning_rate": 4.885883347421809e-05, + "loss": 0.6315, + "step": 1156, + "task_loss": 0.4420894384384155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5910985469818115, + "epoch": 0.98, + "learning_rate": 4.8901098901098904e-05, + "loss": 0.725, + "step": 1157, + "task_loss": 0.3695553243160248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6824641227722168, + "epoch": 0.98, + "learning_rate": 4.8943364327979713e-05, + "loss": 0.7071, + "step": 1158, + "task_loss": 0.424686998128891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5451250672340393, + "epoch": 0.98, + "learning_rate": 4.898562975486053e-05, + "loss": 0.8143, + "step": 1159, + "task_loss": 0.49506431818008423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6511393189430237, + "epoch": 0.98, + "learning_rate": 4.902789518174134e-05, + "loss": 0.8072, + "step": 1160, + "task_loss": 0.8593818545341492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6046502590179443, + "epoch": 0.98, + "learning_rate": 4.907016060862215e-05, + "loss": 0.7687, + "step": 1161, + "task_loss": 0.34596338868141174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1764739751815796, + "epoch": 0.98, + "learning_rate": 4.9112426035502965e-05, + "loss": 0.9759, + "step": 1162, + "task_loss": 1.2387423515319824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9606496095657349, + "epoch": 0.98, + "learning_rate": 4.9154691462383774e-05, + "loss": 0.9379, + "step": 1163, + "task_loss": 1.2935922145843506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7973233461380005, + "epoch": 0.98, + "learning_rate": 4.919695688926458e-05, + "loss": 0.8229, + "step": 1164, + "task_loss": 1.3531285524368286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7259988188743591, + "epoch": 0.98, + "learning_rate": 4.923922231614539e-05, + "loss": 0.6551, + "step": 1165, + "task_loss": 0.6615791320800781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0015661716461182, + "epoch": 0.99, + "learning_rate": 4.928148774302621e-05, + "loss": 0.9141, + "step": 1166, + "task_loss": 0.7225884199142456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6484079957008362, + "epoch": 0.99, + "learning_rate": 4.932375316990702e-05, + "loss": 0.702, + "step": 1167, + "task_loss": 0.47099247574806213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7755736112594604, + "epoch": 0.99, + "learning_rate": 4.936601859678783e-05, + "loss": 0.8777, + "step": 1168, + "task_loss": 1.3386353254318237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1050901412963867, + "epoch": 0.99, + "learning_rate": 4.9408284023668644e-05, + "loss": 0.911, + "step": 1169, + "task_loss": 0.9388408660888672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0147496461868286, + "epoch": 0.99, + "learning_rate": 4.945054945054945e-05, + "loss": 0.6335, + "step": 1170, + "task_loss": 0.38833218812942505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6743987798690796, + "epoch": 0.99, + "learning_rate": 4.949281487743026e-05, + "loss": 0.7153, + "step": 1171, + "task_loss": 0.17356200516223907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4332040548324585, + "epoch": 0.99, + "learning_rate": 4.953508030431108e-05, + "loss": 0.8204, + "step": 1172, + "task_loss": 1.4597587585449219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4684329926967621, + "epoch": 0.99, + "learning_rate": 4.957734573119189e-05, + "loss": 0.8475, + "step": 1173, + "task_loss": 0.19058871269226074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.02614164352417, + "epoch": 0.99, + "learning_rate": 4.96196111580727e-05, + "loss": 0.9345, + "step": 1174, + "task_loss": 1.030394434928894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.20999276638031, + "epoch": 0.99, + "learning_rate": 4.966187658495351e-05, + "loss": 0.998, + "step": 1175, + "task_loss": 1.2831350564956665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1119331121444702, + "epoch": 0.99, + "learning_rate": 4.970414201183432e-05, + "loss": 0.868, + "step": 1176, + "task_loss": 0.5656747221946716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.580865740776062, + "epoch": 0.99, + "learning_rate": 4.974640743871513e-05, + "loss": 0.6226, + "step": 1177, + "task_loss": 0.8677773475646973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4530701637268066, + "epoch": 1.0, + "learning_rate": 4.978867286559594e-05, + "loss": 1.1559, + "step": 1178, + "task_loss": 1.3196793794631958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6576048731803894, + "epoch": 1.0, + "learning_rate": 4.983093829247676e-05, + "loss": 0.7788, + "step": 1179, + "task_loss": 1.4874781370162964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6078149080276489, + "epoch": 1.0, + "learning_rate": 4.987320371935757e-05, + "loss": 0.6888, + "step": 1180, + "task_loss": 0.5247536897659302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8011260628700256, + "epoch": 1.0, + "learning_rate": 4.9915469146238384e-05, + "loss": 0.8736, + "step": 1181, + "task_loss": 1.6819392442703247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6642876863479614, + "epoch": 1.0, + "learning_rate": 4.995773457311919e-05, + "loss": 0.8511, + "step": 1182, + "task_loss": 0.22670377790927887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7785025835037231, + "epoch": 1.0, + "learning_rate": 5e-05, + "loss": 0.8961, + "step": 1183, + "task_loss": 0.6874377727508545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0219122171401978, + "epoch": 1.0, + "learning_rate": 4.999530384145769e-05, + "loss": 1.4442, + "step": 1184, + "task_loss": 0.3908257782459259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6700040102005005, + "epoch": 1.0, + "learning_rate": 4.999060768291538e-05, + "loss": 0.8876, + "step": 1185, + "task_loss": 0.43625307083129883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7480214834213257, + "epoch": 1.0, + "learning_rate": 4.998591152437306e-05, + "loss": 0.7905, + "step": 1186, + "task_loss": 1.7818138599395752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9560362100601196, + "epoch": 1.0, + "learning_rate": 4.9981215365830755e-05, + "loss": 0.8194, + "step": 1187, + "task_loss": 1.0421855449676514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7198781967163086, + "epoch": 1.0, + "learning_rate": 4.997651920728844e-05, + "loss": 0.8554, + "step": 1188, + "task_loss": 0.8423463106155396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.4338092803955078, + "epoch": 1.01, + "learning_rate": 4.997182304874613e-05, + "loss": 0.8194, + "step": 1189, + "task_loss": 0.9605714678764343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7157037258148193, + "epoch": 1.01, + "learning_rate": 4.9967126890203814e-05, + "loss": 0.7299, + "step": 1190, + "task_loss": 0.4957178235054016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2886805534362793, + "epoch": 1.01, + "learning_rate": 4.99624307316615e-05, + "loss": 0.8575, + "step": 1191, + "task_loss": 1.5501511096954346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5618396997451782, + "epoch": 1.01, + "learning_rate": 4.995773457311919e-05, + "loss": 0.6364, + "step": 1192, + "task_loss": 0.4950997829437256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5813946723937988, + "epoch": 1.01, + "learning_rate": 4.995303841457688e-05, + "loss": 0.7186, + "step": 1193, + "task_loss": 0.4804908335208893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6791324615478516, + "epoch": 1.01, + "learning_rate": 4.9948342256034566e-05, + "loss": 0.8508, + "step": 1194, + "task_loss": 0.9111616611480713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.611919641494751, + "epoch": 1.01, + "learning_rate": 4.994364609749225e-05, + "loss": 0.8471, + "step": 1195, + "task_loss": 0.5761310458183289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8236588835716248, + "epoch": 1.01, + "learning_rate": 4.993894993894994e-05, + "loss": 0.8317, + "step": 1196, + "task_loss": 0.22680160403251648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.399686574935913, + "epoch": 1.01, + "learning_rate": 4.993425378040763e-05, + "loss": 0.817, + "step": 1197, + "task_loss": 1.0075557231903076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5247260928153992, + "epoch": 1.01, + "learning_rate": 4.992955762186531e-05, + "loss": 0.6365, + "step": 1198, + "task_loss": 0.9007152915000916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45795583724975586, + "epoch": 1.01, + "learning_rate": 4.9924861463323004e-05, + "loss": 0.4638, + "step": 1199, + "task_loss": 0.4552762508392334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1437170505523682, + "epoch": 1.01, + "learning_rate": 4.992016530478069e-05, + "loss": 0.9732, + "step": 1200, + "task_loss": 0.5305445194244385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7056131958961487, + "epoch": 1.02, + "learning_rate": 4.9915469146238384e-05, + "loss": 0.8853, + "step": 1201, + "task_loss": 0.426228791475296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8267409801483154, + "epoch": 1.02, + "learning_rate": 4.991077298769607e-05, + "loss": 0.7443, + "step": 1202, + "task_loss": 0.4927351772785187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5581204295158386, + "epoch": 1.02, + "learning_rate": 4.990607682915375e-05, + "loss": 0.7433, + "step": 1203, + "task_loss": 0.5641399621963501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8545762300491333, + "epoch": 1.02, + "learning_rate": 4.990138067061144e-05, + "loss": 0.8288, + "step": 1204, + "task_loss": 0.9960431456565857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0042822360992432, + "epoch": 1.02, + "learning_rate": 4.989668451206913e-05, + "loss": 0.7762, + "step": 1205, + "task_loss": 0.7500176429748535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.720524787902832, + "epoch": 1.02, + "learning_rate": 4.989198835352682e-05, + "loss": 0.9325, + "step": 1206, + "task_loss": 1.6345655918121338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5197741985321045, + "epoch": 1.02, + "learning_rate": 4.98872921949845e-05, + "loss": 0.9269, + "step": 1207, + "task_loss": 0.77543705701828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7119118571281433, + "epoch": 1.02, + "learning_rate": 4.9882596036442195e-05, + "loss": 0.9409, + "step": 1208, + "task_loss": 1.2722465991973877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6984691619873047, + "epoch": 1.02, + "learning_rate": 4.987789987789988e-05, + "loss": 0.6093, + "step": 1209, + "task_loss": 1.0201630592346191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8998209834098816, + "epoch": 1.02, + "learning_rate": 4.987320371935757e-05, + "loss": 0.8854, + "step": 1210, + "task_loss": 1.6183542013168335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9749068021774292, + "epoch": 1.02, + "learning_rate": 4.9868507560815254e-05, + "loss": 0.8991, + "step": 1211, + "task_loss": 1.542296290397644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0921818017959595, + "epoch": 1.02, + "learning_rate": 4.986381140227294e-05, + "loss": 0.9252, + "step": 1212, + "task_loss": 1.5315409898757935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46266135573387146, + "epoch": 1.03, + "learning_rate": 4.985911524373063e-05, + "loss": 0.5816, + "step": 1213, + "task_loss": 0.39799585938453674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7842148542404175, + "epoch": 1.03, + "learning_rate": 4.985441908518832e-05, + "loss": 0.8899, + "step": 1214, + "task_loss": 0.623865008354187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7924678325653076, + "epoch": 1.03, + "learning_rate": 4.9849722926646006e-05, + "loss": 0.6813, + "step": 1215, + "task_loss": 1.4836411476135254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7346034646034241, + "epoch": 1.03, + "learning_rate": 4.984502676810369e-05, + "loss": 0.8161, + "step": 1216, + "task_loss": 0.911013662815094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6410455703735352, + "epoch": 1.03, + "learning_rate": 4.984033060956138e-05, + "loss": 0.7357, + "step": 1217, + "task_loss": 0.21396853029727936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7787551879882812, + "epoch": 1.03, + "learning_rate": 4.983563445101907e-05, + "loss": 0.7078, + "step": 1218, + "task_loss": 2.689532518386841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6562471389770508, + "epoch": 1.03, + "learning_rate": 4.983093829247676e-05, + "loss": 0.7621, + "step": 1219, + "task_loss": 0.6049256920814514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47610753774642944, + "epoch": 1.03, + "learning_rate": 4.9826242133934444e-05, + "loss": 0.6722, + "step": 1220, + "task_loss": 0.581636905670166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7685062289237976, + "epoch": 1.03, + "learning_rate": 4.982154597539213e-05, + "loss": 0.647, + "step": 1221, + "task_loss": 1.3361307382583618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6156908273696899, + "epoch": 1.03, + "learning_rate": 4.981684981684982e-05, + "loss": 0.6639, + "step": 1222, + "task_loss": 0.9097985029220581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6651860475540161, + "epoch": 1.03, + "learning_rate": 4.981215365830751e-05, + "loss": 0.5451, + "step": 1223, + "task_loss": 0.1376887410879135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9289253354072571, + "epoch": 1.03, + "learning_rate": 4.980745749976519e-05, + "loss": 0.8983, + "step": 1224, + "task_loss": 1.403479814529419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4269893765449524, + "epoch": 1.04, + "learning_rate": 4.980276134122288e-05, + "loss": 0.5757, + "step": 1225, + "task_loss": 0.4627838432788849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4629426598548889, + "epoch": 1.04, + "learning_rate": 4.979806518268057e-05, + "loss": 0.6944, + "step": 1226, + "task_loss": 0.44136321544647217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8676228523254395, + "epoch": 1.04, + "learning_rate": 4.9793369024138256e-05, + "loss": 0.8026, + "step": 1227, + "task_loss": 1.024471640586853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0402860641479492, + "epoch": 1.04, + "learning_rate": 4.978867286559594e-05, + "loss": 0.7979, + "step": 1228, + "task_loss": 1.2598527669906616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6387060880661011, + "epoch": 1.04, + "learning_rate": 4.978397670705363e-05, + "loss": 0.6613, + "step": 1229, + "task_loss": 0.3575829863548279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5857902765274048, + "epoch": 1.04, + "learning_rate": 4.977928054851132e-05, + "loss": 0.7542, + "step": 1230, + "task_loss": 0.5660538077354431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.863217830657959, + "epoch": 1.04, + "learning_rate": 4.977458438996901e-05, + "loss": 0.7411, + "step": 1231, + "task_loss": 0.7006186246871948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8919067978858948, + "epoch": 1.04, + "learning_rate": 4.97698882314267e-05, + "loss": 0.7488, + "step": 1232, + "task_loss": 0.7664223909378052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7116492390632629, + "epoch": 1.04, + "learning_rate": 4.976519207288438e-05, + "loss": 0.6764, + "step": 1233, + "task_loss": 1.759402871131897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7736808061599731, + "epoch": 1.04, + "learning_rate": 4.9760495914342073e-05, + "loss": 0.8395, + "step": 1234, + "task_loss": 0.8507089018821716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6067366003990173, + "epoch": 1.04, + "learning_rate": 4.975579975579976e-05, + "loss": 0.7065, + "step": 1235, + "task_loss": 1.3265104293823242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5824638605117798, + "epoch": 1.04, + "learning_rate": 4.9751103597257446e-05, + "loss": 0.6792, + "step": 1236, + "task_loss": 1.2145124673843384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5448690056800842, + "epoch": 1.05, + "learning_rate": 4.974640743871513e-05, + "loss": 0.5802, + "step": 1237, + "task_loss": 1.0803186893463135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3911209106445312, + "epoch": 1.05, + "learning_rate": 4.974171128017282e-05, + "loss": 1.0132, + "step": 1238, + "task_loss": 1.088618278503418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6184315085411072, + "epoch": 1.05, + "learning_rate": 4.973701512163051e-05, + "loss": 0.9596, + "step": 1239, + "task_loss": 0.1925496757030487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7387796640396118, + "epoch": 1.05, + "learning_rate": 4.97323189630882e-05, + "loss": 0.805, + "step": 1240, + "task_loss": 0.52329421043396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5322288870811462, + "epoch": 1.05, + "learning_rate": 4.9727622804545885e-05, + "loss": 0.63, + "step": 1241, + "task_loss": 0.03427287936210632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5693274736404419, + "epoch": 1.05, + "learning_rate": 4.972292664600357e-05, + "loss": 0.6606, + "step": 1242, + "task_loss": 0.47436222434043884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8992235064506531, + "epoch": 1.05, + "learning_rate": 4.971823048746126e-05, + "loss": 0.6537, + "step": 1243, + "task_loss": 1.319307565689087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5618818998336792, + "epoch": 1.05, + "learning_rate": 4.971353432891895e-05, + "loss": 0.5835, + "step": 1244, + "task_loss": 0.5415380001068115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9810945987701416, + "epoch": 1.05, + "learning_rate": 4.970883817037664e-05, + "loss": 0.8421, + "step": 1245, + "task_loss": 1.4955774545669556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4485442340373993, + "epoch": 1.05, + "learning_rate": 4.970414201183432e-05, + "loss": 0.7466, + "step": 1246, + "task_loss": 0.7752739191055298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4207412302494049, + "epoch": 1.05, + "learning_rate": 4.969944585329201e-05, + "loss": 0.72, + "step": 1247, + "task_loss": 0.2097749412059784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7100080847740173, + "epoch": 1.05, + "learning_rate": 4.9694749694749696e-05, + "loss": 0.7058, + "step": 1248, + "task_loss": 0.5616713762283325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0599325895309448, + "epoch": 1.06, + "learning_rate": 4.969005353620739e-05, + "loss": 0.6725, + "step": 1249, + "task_loss": 1.0976859331130981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7977098226547241, + "epoch": 1.06, + "learning_rate": 4.968535737766507e-05, + "loss": 0.8266, + "step": 1250, + "task_loss": 1.1248180866241455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7050226330757141, + "epoch": 1.06, + "learning_rate": 4.968066121912276e-05, + "loss": 0.6759, + "step": 1251, + "task_loss": 0.8097040057182312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9294298887252808, + "epoch": 1.06, + "learning_rate": 4.967596506058045e-05, + "loss": 0.8009, + "step": 1252, + "task_loss": 1.0824682712554932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8007803559303284, + "epoch": 1.06, + "learning_rate": 4.9671268902038134e-05, + "loss": 0.7981, + "step": 1253, + "task_loss": 1.6562753915786743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6191568970680237, + "epoch": 1.06, + "learning_rate": 4.966657274349582e-05, + "loss": 0.7563, + "step": 1254, + "task_loss": 1.4832048416137695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5116437673568726, + "epoch": 1.06, + "learning_rate": 4.966187658495351e-05, + "loss": 0.5793, + "step": 1255, + "task_loss": 0.07216201722621918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6266251802444458, + "epoch": 1.06, + "learning_rate": 4.96571804264112e-05, + "loss": 0.5851, + "step": 1256, + "task_loss": 0.587835967540741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4913870394229889, + "epoch": 1.06, + "learning_rate": 4.9652484267868886e-05, + "loss": 0.6983, + "step": 1257, + "task_loss": 0.31397953629493713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7225763201713562, + "epoch": 1.06, + "learning_rate": 4.964778810932657e-05, + "loss": 0.7204, + "step": 1258, + "task_loss": 1.4201581478118896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35853835940361023, + "epoch": 1.06, + "learning_rate": 4.964309195078426e-05, + "loss": 0.5001, + "step": 1259, + "task_loss": 0.052865203469991684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9247846007347107, + "epoch": 1.07, + "learning_rate": 4.9638395792241945e-05, + "loss": 0.9588, + "step": 1260, + "task_loss": 0.70647132396698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6399618983268738, + "epoch": 1.07, + "learning_rate": 4.963369963369964e-05, + "loss": 0.6726, + "step": 1261, + "task_loss": 1.4716986417770386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6145834922790527, + "epoch": 1.07, + "learning_rate": 4.9629003475157325e-05, + "loss": 0.5621, + "step": 1262, + "task_loss": 0.7125911116600037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5137615203857422, + "epoch": 1.07, + "learning_rate": 4.962430731661501e-05, + "loss": 0.6575, + "step": 1263, + "task_loss": 0.68035888671875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8028764724731445, + "epoch": 1.07, + "learning_rate": 4.96196111580727e-05, + "loss": 0.8932, + "step": 1264, + "task_loss": 1.5125782489776611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.627933144569397, + "epoch": 1.07, + "learning_rate": 4.961491499953039e-05, + "loss": 0.7321, + "step": 1265, + "task_loss": 0.8223745226860046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7281454205513, + "epoch": 1.07, + "learning_rate": 4.961021884098808e-05, + "loss": 0.7866, + "step": 1266, + "task_loss": 0.7353380918502808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7552920579910278, + "epoch": 1.07, + "learning_rate": 4.9605522682445757e-05, + "loss": 0.783, + "step": 1267, + "task_loss": 1.9622963666915894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.805246889591217, + "epoch": 1.07, + "learning_rate": 4.960082652390345e-05, + "loss": 0.7606, + "step": 1268, + "task_loss": 0.6831367015838623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5910681486129761, + "epoch": 1.07, + "learning_rate": 4.9596130365361136e-05, + "loss": 0.7526, + "step": 1269, + "task_loss": 0.5576522946357727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.869104266166687, + "epoch": 1.07, + "learning_rate": 4.959143420681883e-05, + "loss": 0.8097, + "step": 1270, + "task_loss": 1.334325909614563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3867793083190918, + "epoch": 1.07, + "learning_rate": 4.958673804827651e-05, + "loss": 0.9968, + "step": 1271, + "task_loss": 0.39703118801116943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7016822099685669, + "epoch": 1.08, + "learning_rate": 4.95820418897342e-05, + "loss": 0.6096, + "step": 1272, + "task_loss": 0.7889289855957031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4595268964767456, + "epoch": 1.08, + "learning_rate": 4.957734573119189e-05, + "loss": 0.7103, + "step": 1273, + "task_loss": 0.9168697595596313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5436785221099854, + "epoch": 1.08, + "learning_rate": 4.9572649572649575e-05, + "loss": 0.7123, + "step": 1274, + "task_loss": 0.8932439088821411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.925026535987854, + "epoch": 1.08, + "learning_rate": 4.956795341410726e-05, + "loss": 0.8667, + "step": 1275, + "task_loss": 0.9106972813606262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6150174140930176, + "epoch": 1.08, + "learning_rate": 4.956325725556495e-05, + "loss": 0.7665, + "step": 1276, + "task_loss": 0.6815412640571594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7189949154853821, + "epoch": 1.08, + "learning_rate": 4.955856109702264e-05, + "loss": 0.782, + "step": 1277, + "task_loss": 0.5579949617385864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.945213794708252, + "epoch": 1.08, + "learning_rate": 4.955386493848033e-05, + "loss": 0.8507, + "step": 1278, + "task_loss": 1.20814049243927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5198201537132263, + "epoch": 1.08, + "learning_rate": 4.954916877993801e-05, + "loss": 0.6254, + "step": 1279, + "task_loss": 0.13831321895122528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9499718546867371, + "epoch": 1.08, + "learning_rate": 4.95444726213957e-05, + "loss": 0.7919, + "step": 1280, + "task_loss": 1.004666805267334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9411555528640747, + "epoch": 1.08, + "learning_rate": 4.9539776462853386e-05, + "loss": 0.7531, + "step": 1281, + "task_loss": 1.5318126678466797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9158565998077393, + "epoch": 1.08, + "learning_rate": 4.953508030431108e-05, + "loss": 0.712, + "step": 1282, + "task_loss": 1.5019237995147705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7218375205993652, + "epoch": 1.08, + "learning_rate": 4.9530384145768765e-05, + "loss": 0.7939, + "step": 1283, + "task_loss": 1.0321540832519531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6734896302223206, + "epoch": 1.09, + "learning_rate": 4.952568798722645e-05, + "loss": 0.6805, + "step": 1284, + "task_loss": 0.22773271799087524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49604663252830505, + "epoch": 1.09, + "learning_rate": 4.952099182868414e-05, + "loss": 0.5015, + "step": 1285, + "task_loss": 0.4866641163825989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5691288709640503, + "epoch": 1.09, + "learning_rate": 4.9516295670141824e-05, + "loss": 0.7545, + "step": 1286, + "task_loss": 0.5059365630149841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37697482109069824, + "epoch": 1.09, + "learning_rate": 4.951159951159952e-05, + "loss": 0.6921, + "step": 1287, + "task_loss": 0.07297901064157486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6461402177810669, + "epoch": 1.09, + "learning_rate": 4.95069033530572e-05, + "loss": 0.687, + "step": 1288, + "task_loss": 0.5798022747039795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5700920820236206, + "epoch": 1.09, + "learning_rate": 4.950220719451489e-05, + "loss": 0.7963, + "step": 1289, + "task_loss": 0.028102673590183258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7761542797088623, + "epoch": 1.09, + "learning_rate": 4.9497511035972576e-05, + "loss": 0.7498, + "step": 1290, + "task_loss": 0.7680091857910156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7093247175216675, + "epoch": 1.09, + "learning_rate": 4.949281487743026e-05, + "loss": 0.8072, + "step": 1291, + "task_loss": 0.9535436034202576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7288438081741333, + "epoch": 1.09, + "learning_rate": 4.9488118718887956e-05, + "loss": 0.8115, + "step": 1292, + "task_loss": 1.1577751636505127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8672157526016235, + "epoch": 1.09, + "learning_rate": 4.9483422560345635e-05, + "loss": 0.6668, + "step": 1293, + "task_loss": 0.5670070052146912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46113306283950806, + "epoch": 1.09, + "learning_rate": 4.947872640180333e-05, + "loss": 0.5732, + "step": 1294, + "task_loss": 0.48061829805374146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5109890699386597, + "epoch": 1.09, + "learning_rate": 4.9474030243261015e-05, + "loss": 0.5816, + "step": 1295, + "task_loss": 0.6218225955963135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7340753078460693, + "epoch": 1.1, + "learning_rate": 4.946933408471871e-05, + "loss": 0.6715, + "step": 1296, + "task_loss": 0.11878528445959091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5694925785064697, + "epoch": 1.1, + "learning_rate": 4.946463792617639e-05, + "loss": 0.5517, + "step": 1297, + "task_loss": 0.3058575391769409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43904221057891846, + "epoch": 1.1, + "learning_rate": 4.9459941767634074e-05, + "loss": 0.4085, + "step": 1298, + "task_loss": 0.48544132709503174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1530795097351074, + "epoch": 1.1, + "learning_rate": 4.945524560909177e-05, + "loss": 0.7667, + "step": 1299, + "task_loss": 1.3456939458847046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5567535161972046, + "epoch": 1.1, + "learning_rate": 4.945054945054945e-05, + "loss": 0.5951, + "step": 1300, + "task_loss": 0.15451423823833466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6514447331428528, + "epoch": 1.1, + "learning_rate": 4.944585329200714e-05, + "loss": 0.6861, + "step": 1301, + "task_loss": 1.0917481184005737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6338200569152832, + "epoch": 1.1, + "learning_rate": 4.9441157133464826e-05, + "loss": 0.6894, + "step": 1302, + "task_loss": 0.9866353273391724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.83306884765625, + "epoch": 1.1, + "learning_rate": 4.943646097492252e-05, + "loss": 0.6316, + "step": 1303, + "task_loss": 0.9079781770706177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6206182837486267, + "epoch": 1.1, + "learning_rate": 4.9431764816380205e-05, + "loss": 0.6452, + "step": 1304, + "task_loss": 1.4697750806808472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4492408037185669, + "epoch": 1.1, + "learning_rate": 4.942706865783789e-05, + "loss": 0.7952, + "step": 1305, + "task_loss": 0.5300272703170776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6468190550804138, + "epoch": 1.1, + "learning_rate": 4.942237249929558e-05, + "loss": 0.668, + "step": 1306, + "task_loss": 0.9608815312385559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6154900193214417, + "epoch": 1.1, + "learning_rate": 4.9417676340753264e-05, + "loss": 0.6545, + "step": 1307, + "task_loss": 0.24266399443149567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6244776248931885, + "epoch": 1.11, + "learning_rate": 4.941298018221096e-05, + "loss": 0.7551, + "step": 1308, + "task_loss": 0.7449018955230713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8633492588996887, + "epoch": 1.11, + "learning_rate": 4.9408284023668644e-05, + "loss": 0.6974, + "step": 1309, + "task_loss": 0.9308632612228394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6614588499069214, + "epoch": 1.11, + "learning_rate": 4.940358786512633e-05, + "loss": 0.7627, + "step": 1310, + "task_loss": 1.6496520042419434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5786677598953247, + "epoch": 1.11, + "learning_rate": 4.9398891706584017e-05, + "loss": 0.7021, + "step": 1311, + "task_loss": 0.5184558033943176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36565640568733215, + "epoch": 1.11, + "learning_rate": 4.93941955480417e-05, + "loss": 0.5384, + "step": 1312, + "task_loss": 0.19317564368247986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8879238367080688, + "epoch": 1.11, + "learning_rate": 4.9389499389499396e-05, + "loss": 0.7162, + "step": 1313, + "task_loss": 1.733871579170227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.649004340171814, + "epoch": 1.11, + "learning_rate": 4.9384803230957076e-05, + "loss": 0.6792, + "step": 1314, + "task_loss": 1.2544273138046265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4181835949420929, + "epoch": 1.11, + "learning_rate": 4.938010707241477e-05, + "loss": 0.5004, + "step": 1315, + "task_loss": 0.5713106393814087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6857585906982422, + "epoch": 1.11, + "learning_rate": 4.9375410913872455e-05, + "loss": 0.5674, + "step": 1316, + "task_loss": 0.6371403336524963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8400878310203552, + "epoch": 1.11, + "learning_rate": 4.937071475533014e-05, + "loss": 0.7457, + "step": 1317, + "task_loss": 0.8803353905677795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6222090721130371, + "epoch": 1.11, + "learning_rate": 4.936601859678783e-05, + "loss": 0.6496, + "step": 1318, + "task_loss": 0.8294748067855835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8189681172370911, + "epoch": 1.11, + "learning_rate": 4.9361322438245514e-05, + "loss": 0.5536, + "step": 1319, + "task_loss": 0.7919564843177795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4970915913581848, + "epoch": 1.12, + "learning_rate": 4.935662627970321e-05, + "loss": 0.7303, + "step": 1320, + "task_loss": 0.5106635689735413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9206844568252563, + "epoch": 1.12, + "learning_rate": 4.9351930121160893e-05, + "loss": 0.6417, + "step": 1321, + "task_loss": 0.9685536026954651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6231045722961426, + "epoch": 1.12, + "learning_rate": 4.934723396261858e-05, + "loss": 0.9221, + "step": 1322, + "task_loss": 1.9884967803955078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6323938369750977, + "epoch": 1.12, + "learning_rate": 4.9342537804076266e-05, + "loss": 0.6219, + "step": 1323, + "task_loss": 0.816940188407898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.775493323802948, + "epoch": 1.12, + "learning_rate": 4.933784164553395e-05, + "loss": 0.7818, + "step": 1324, + "task_loss": 0.6912090182304382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6166880130767822, + "epoch": 1.12, + "learning_rate": 4.9333145486991646e-05, + "loss": 0.6944, + "step": 1325, + "task_loss": 0.27689340710639954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5524451732635498, + "epoch": 1.12, + "learning_rate": 4.932844932844933e-05, + "loss": 0.631, + "step": 1326, + "task_loss": 0.5284366011619568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5834505558013916, + "epoch": 1.12, + "learning_rate": 4.932375316990702e-05, + "loss": 0.8601, + "step": 1327, + "task_loss": 0.501453697681427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6221016645431519, + "epoch": 1.12, + "learning_rate": 4.9319057011364705e-05, + "loss": 0.5564, + "step": 1328, + "task_loss": 0.5623244643211365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47824230790138245, + "epoch": 1.12, + "learning_rate": 4.93143608528224e-05, + "loss": 0.8025, + "step": 1329, + "task_loss": 0.16670876741409302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.779980480670929, + "epoch": 1.12, + "learning_rate": 4.9309664694280084e-05, + "loss": 0.8125, + "step": 1330, + "task_loss": 1.2094398736953735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6504977941513062, + "epoch": 1.13, + "learning_rate": 4.9304968535737764e-05, + "loss": 0.6747, + "step": 1331, + "task_loss": 1.3107285499572754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4757450520992279, + "epoch": 1.13, + "learning_rate": 4.930027237719546e-05, + "loss": 0.7751, + "step": 1332, + "task_loss": 0.765080988407135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41826513409614563, + "epoch": 1.13, + "learning_rate": 4.929557621865314e-05, + "loss": 0.5976, + "step": 1333, + "task_loss": 0.34595662355422974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4094617962837219, + "epoch": 1.13, + "learning_rate": 4.9290880060110836e-05, + "loss": 0.6009, + "step": 1334, + "task_loss": 0.730431079864502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8176749348640442, + "epoch": 1.13, + "learning_rate": 4.9286183901568516e-05, + "loss": 0.8557, + "step": 1335, + "task_loss": 1.1459325551986694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8718534111976624, + "epoch": 1.13, + "learning_rate": 4.928148774302621e-05, + "loss": 0.9376, + "step": 1336, + "task_loss": 1.2973120212554932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3558146357536316, + "epoch": 1.13, + "learning_rate": 4.9276791584483895e-05, + "loss": 0.8362, + "step": 1337, + "task_loss": 0.5811787247657776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0423654317855835, + "epoch": 1.13, + "learning_rate": 4.927209542594158e-05, + "loss": 0.7964, + "step": 1338, + "task_loss": 1.681256651878357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7893009781837463, + "epoch": 1.13, + "learning_rate": 4.9267399267399275e-05, + "loss": 0.6467, + "step": 1339, + "task_loss": 1.0758448839187622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8247897624969482, + "epoch": 1.13, + "learning_rate": 4.9262703108856954e-05, + "loss": 0.7105, + "step": 1340, + "task_loss": 1.522544503211975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6625708341598511, + "epoch": 1.13, + "learning_rate": 4.925800695031465e-05, + "loss": 0.9045, + "step": 1341, + "task_loss": 0.3208436071872711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46288660168647766, + "epoch": 1.13, + "learning_rate": 4.9253310791772334e-05, + "loss": 0.7967, + "step": 1342, + "task_loss": 0.40859881043434143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4174606204032898, + "epoch": 1.14, + "learning_rate": 4.924861463323002e-05, + "loss": 0.5645, + "step": 1343, + "task_loss": 0.18321073055267334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1893832683563232, + "epoch": 1.14, + "learning_rate": 4.9243918474687706e-05, + "loss": 0.823, + "step": 1344, + "task_loss": 1.0280473232269287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3973398208618164, + "epoch": 1.14, + "learning_rate": 4.923922231614539e-05, + "loss": 0.5911, + "step": 1345, + "task_loss": 0.6870942115783691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6313991546630859, + "epoch": 1.14, + "learning_rate": 4.9234526157603086e-05, + "loss": 0.6875, + "step": 1346, + "task_loss": 0.9727690815925598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6980172395706177, + "epoch": 1.14, + "learning_rate": 4.922982999906077e-05, + "loss": 0.7374, + "step": 1347, + "task_loss": 0.5002503991127014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5147057175636292, + "epoch": 1.14, + "learning_rate": 4.922513384051846e-05, + "loss": 0.814, + "step": 1348, + "task_loss": 0.9428136348724365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3389057517051697, + "epoch": 1.14, + "learning_rate": 4.9220437681976145e-05, + "loss": 0.5903, + "step": 1349, + "task_loss": 0.5447511672973633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6059243083000183, + "epoch": 1.14, + "learning_rate": 4.921574152343383e-05, + "loss": 0.609, + "step": 1350, + "task_loss": 1.8156611919403076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41145578026771545, + "epoch": 1.14, + "learning_rate": 4.9211045364891524e-05, + "loss": 0.728, + "step": 1351, + "task_loss": 0.9913026094436646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6190500259399414, + "epoch": 1.14, + "learning_rate": 4.9206349206349204e-05, + "loss": 0.7839, + "step": 1352, + "task_loss": 0.978970468044281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8000482320785522, + "epoch": 1.14, + "learning_rate": 4.92016530478069e-05, + "loss": 0.7777, + "step": 1353, + "task_loss": 0.5483626127243042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38977473974227905, + "epoch": 1.14, + "learning_rate": 4.919695688926458e-05, + "loss": 0.581, + "step": 1354, + "task_loss": 0.6048043370246887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5673363208770752, + "epoch": 1.15, + "learning_rate": 4.919226073072227e-05, + "loss": 0.7416, + "step": 1355, + "task_loss": 0.5679433941841125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9642918109893799, + "epoch": 1.15, + "learning_rate": 4.918756457217996e-05, + "loss": 0.8678, + "step": 1356, + "task_loss": 0.20047631859779358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3476962447166443, + "epoch": 1.15, + "learning_rate": 4.918286841363764e-05, + "loss": 0.8088, + "step": 1357, + "task_loss": 0.24324047565460205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9156034588813782, + "epoch": 1.15, + "learning_rate": 4.9178172255095335e-05, + "loss": 0.8393, + "step": 1358, + "task_loss": 1.9249433279037476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.763570249080658, + "epoch": 1.15, + "learning_rate": 4.917347609655302e-05, + "loss": 0.6236, + "step": 1359, + "task_loss": 0.23107849061489105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9968781471252441, + "epoch": 1.15, + "learning_rate": 4.9168779938010715e-05, + "loss": 0.7562, + "step": 1360, + "task_loss": 0.545785665512085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6500983238220215, + "epoch": 1.15, + "learning_rate": 4.9164083779468394e-05, + "loss": 0.7808, + "step": 1361, + "task_loss": 0.6783839464187622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4511190354824066, + "epoch": 1.15, + "learning_rate": 4.915938762092608e-05, + "loss": 0.6541, + "step": 1362, + "task_loss": 0.5171618461608887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4862748086452484, + "epoch": 1.15, + "learning_rate": 4.9154691462383774e-05, + "loss": 0.8317, + "step": 1363, + "task_loss": 0.8363558053970337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40697258710861206, + "epoch": 1.15, + "learning_rate": 4.914999530384146e-05, + "loss": 0.6001, + "step": 1364, + "task_loss": 0.5757407546043396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42105549573898315, + "epoch": 1.15, + "learning_rate": 4.9145299145299147e-05, + "loss": 0.6072, + "step": 1365, + "task_loss": 1.1838321685791016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9189705848693848, + "epoch": 1.15, + "learning_rate": 4.914060298675683e-05, + "loss": 0.7744, + "step": 1366, + "task_loss": 1.1158034801483154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5393441915512085, + "epoch": 1.16, + "learning_rate": 4.9135906828214526e-05, + "loss": 0.5768, + "step": 1367, + "task_loss": 0.4702691435813904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0310040712356567, + "epoch": 1.16, + "learning_rate": 4.913121066967221e-05, + "loss": 0.7056, + "step": 1368, + "task_loss": 1.0094150304794312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38820499181747437, + "epoch": 1.16, + "learning_rate": 4.91265145111299e-05, + "loss": 0.7687, + "step": 1369, + "task_loss": 0.5620743036270142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5300425291061401, + "epoch": 1.16, + "learning_rate": 4.9121818352587585e-05, + "loss": 0.5418, + "step": 1370, + "task_loss": 0.9790509343147278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6569465398788452, + "epoch": 1.16, + "learning_rate": 4.911712219404527e-05, + "loss": 0.5937, + "step": 1371, + "task_loss": 1.4016033411026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6341307163238525, + "epoch": 1.16, + "learning_rate": 4.9112426035502965e-05, + "loss": 0.6251, + "step": 1372, + "task_loss": 0.5100233554840088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8187536597251892, + "epoch": 1.16, + "learning_rate": 4.910772987696065e-05, + "loss": 0.77, + "step": 1373, + "task_loss": 0.9382261633872986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7493864297866821, + "epoch": 1.16, + "learning_rate": 4.910303371841834e-05, + "loss": 0.8435, + "step": 1374, + "task_loss": 1.4655905961990356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5396281480789185, + "epoch": 1.16, + "learning_rate": 4.9098337559876024e-05, + "loss": 0.76, + "step": 1375, + "task_loss": 1.3066457509994507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7906984090805054, + "epoch": 1.16, + "learning_rate": 4.909364140133371e-05, + "loss": 0.739, + "step": 1376, + "task_loss": 0.7647501826286316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7042509317398071, + "epoch": 1.16, + "learning_rate": 4.90889452427914e-05, + "loss": 0.7252, + "step": 1377, + "task_loss": 1.0582749843597412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7503671646118164, + "epoch": 1.16, + "learning_rate": 4.908424908424908e-05, + "loss": 0.7909, + "step": 1378, + "task_loss": 0.24215669929981232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5873064994812012, + "epoch": 1.17, + "learning_rate": 4.9079552925706776e-05, + "loss": 0.6132, + "step": 1379, + "task_loss": 0.7812985181808472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8959978818893433, + "epoch": 1.17, + "learning_rate": 4.907485676716446e-05, + "loss": 0.7086, + "step": 1380, + "task_loss": 0.7291380167007446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0126283168792725, + "epoch": 1.17, + "learning_rate": 4.907016060862215e-05, + "loss": 0.7556, + "step": 1381, + "task_loss": 1.4942545890808105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8307521343231201, + "epoch": 1.17, + "learning_rate": 4.9065464450079835e-05, + "loss": 0.6981, + "step": 1382, + "task_loss": 0.6868078112602234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.634753406047821, + "epoch": 1.17, + "learning_rate": 4.906076829153752e-05, + "loss": 0.6839, + "step": 1383, + "task_loss": 2.4826629161834717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9333901405334473, + "epoch": 1.17, + "learning_rate": 4.9056072132995214e-05, + "loss": 0.6132, + "step": 1384, + "task_loss": 0.7142943143844604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6435319185256958, + "epoch": 1.17, + "learning_rate": 4.90513759744529e-05, + "loss": 0.6868, + "step": 1385, + "task_loss": 0.8403874635696411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6717526912689209, + "epoch": 1.17, + "learning_rate": 4.904667981591059e-05, + "loss": 0.6066, + "step": 1386, + "task_loss": 0.6957762837409973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3425019681453705, + "epoch": 1.17, + "learning_rate": 4.904198365736827e-05, + "loss": 0.6661, + "step": 1387, + "task_loss": 0.25099050998687744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6465728878974915, + "epoch": 1.17, + "learning_rate": 4.903728749882596e-05, + "loss": 0.6325, + "step": 1388, + "task_loss": 0.41266924142837524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6118613481521606, + "epoch": 1.17, + "learning_rate": 4.903259134028365e-05, + "loss": 0.6334, + "step": 1389, + "task_loss": 1.2684624195098877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7302461862564087, + "epoch": 1.17, + "learning_rate": 4.902789518174134e-05, + "loss": 0.7814, + "step": 1390, + "task_loss": 0.7224894762039185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5329036712646484, + "epoch": 1.18, + "learning_rate": 4.9023199023199025e-05, + "loss": 0.6941, + "step": 1391, + "task_loss": 0.7438899874687195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.828599214553833, + "epoch": 1.18, + "learning_rate": 4.901850286465671e-05, + "loss": 0.8444, + "step": 1392, + "task_loss": 0.4139252007007599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6210654377937317, + "epoch": 1.18, + "learning_rate": 4.90138067061144e-05, + "loss": 0.903, + "step": 1393, + "task_loss": 1.1307477951049805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5672077536582947, + "epoch": 1.18, + "learning_rate": 4.900911054757209e-05, + "loss": 0.7461, + "step": 1394, + "task_loss": 1.317976474761963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5671592950820923, + "epoch": 1.18, + "learning_rate": 4.900441438902977e-05, + "loss": 0.8757, + "step": 1395, + "task_loss": 0.8834158778190613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0163724422454834, + "epoch": 1.18, + "learning_rate": 4.8999718230487464e-05, + "loss": 0.7693, + "step": 1396, + "task_loss": 1.3023768663406372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5705899596214294, + "epoch": 1.18, + "learning_rate": 4.899502207194515e-05, + "loss": 0.7207, + "step": 1397, + "task_loss": 1.1483713388442993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7187027931213379, + "epoch": 1.18, + "learning_rate": 4.899032591340284e-05, + "loss": 0.672, + "step": 1398, + "task_loss": 0.4588943421840668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7078155875205994, + "epoch": 1.18, + "learning_rate": 4.898562975486053e-05, + "loss": 0.6821, + "step": 1399, + "task_loss": 1.7584826946258545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7822513580322266, + "epoch": 1.18, + "learning_rate": 4.898093359631821e-05, + "loss": 0.6203, + "step": 1400, + "task_loss": 0.9964765906333923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6452760100364685, + "epoch": 1.18, + "learning_rate": 4.89762374377759e-05, + "loss": 0.8515, + "step": 1401, + "task_loss": 1.3357552289962769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.82828688621521, + "epoch": 1.19, + "learning_rate": 4.897154127923359e-05, + "loss": 0.55, + "step": 1402, + "task_loss": 1.0566030740737915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6824697256088257, + "epoch": 1.19, + "learning_rate": 4.896684512069128e-05, + "loss": 0.5429, + "step": 1403, + "task_loss": 1.867529273033142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5981334447860718, + "epoch": 1.19, + "learning_rate": 4.896214896214896e-05, + "loss": 0.5964, + "step": 1404, + "task_loss": 0.8120792508125305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4322366714477539, + "epoch": 1.19, + "learning_rate": 4.8957452803606654e-05, + "loss": 0.6743, + "step": 1405, + "task_loss": 0.9135465621948242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6989839673042297, + "epoch": 1.19, + "learning_rate": 4.895275664506434e-05, + "loss": 0.6595, + "step": 1406, + "task_loss": 1.323115348815918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7032515406608582, + "epoch": 1.19, + "learning_rate": 4.894806048652203e-05, + "loss": 0.8113, + "step": 1407, + "task_loss": 1.4239848852157593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5719220042228699, + "epoch": 1.19, + "learning_rate": 4.8943364327979713e-05, + "loss": 0.6589, + "step": 1408, + "task_loss": 1.262266993522644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8297879695892334, + "epoch": 1.19, + "learning_rate": 4.89386681694374e-05, + "loss": 0.7681, + "step": 1409, + "task_loss": 1.1444751024246216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8285918235778809, + "epoch": 1.19, + "learning_rate": 4.893397201089509e-05, + "loss": 0.6378, + "step": 1410, + "task_loss": 1.5723944902420044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3582594394683838, + "epoch": 1.19, + "learning_rate": 4.892927585235278e-05, + "loss": 1.0444, + "step": 1411, + "task_loss": 1.5388820171356201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49790093302726746, + "epoch": 1.19, + "learning_rate": 4.8924579693810466e-05, + "loss": 0.741, + "step": 1412, + "task_loss": 1.040041208267212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8510262966156006, + "epoch": 1.19, + "learning_rate": 4.891988353526815e-05, + "loss": 0.5667, + "step": 1413, + "task_loss": 0.9218040108680725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7334065437316895, + "epoch": 1.2, + "learning_rate": 4.891518737672584e-05, + "loss": 0.6119, + "step": 1414, + "task_loss": 1.0949002504348755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6995035409927368, + "epoch": 1.2, + "learning_rate": 4.891049121818353e-05, + "loss": 0.8343, + "step": 1415, + "task_loss": 1.597287893295288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5928952693939209, + "epoch": 1.2, + "learning_rate": 4.890579505964122e-05, + "loss": 0.6578, + "step": 1416, + "task_loss": 1.0109384059906006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9946739077568054, + "epoch": 1.2, + "learning_rate": 4.8901098901098904e-05, + "loss": 0.7741, + "step": 1417, + "task_loss": 1.490555763244629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6486961841583252, + "epoch": 1.2, + "learning_rate": 4.889640274255659e-05, + "loss": 0.7022, + "step": 1418, + "task_loss": 1.1189180612564087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46602484583854675, + "epoch": 1.2, + "learning_rate": 4.889170658401428e-05, + "loss": 0.8156, + "step": 1419, + "task_loss": 0.8417402505874634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4847773611545563, + "epoch": 1.2, + "learning_rate": 4.888701042547197e-05, + "loss": 0.6477, + "step": 1420, + "task_loss": 0.04592149704694748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2685670256614685, + "epoch": 1.2, + "learning_rate": 4.888231426692965e-05, + "loss": 0.4722, + "step": 1421, + "task_loss": 0.040146518498659134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1056642532348633, + "epoch": 1.2, + "learning_rate": 4.887761810838734e-05, + "loss": 0.777, + "step": 1422, + "task_loss": 1.6640281677246094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4153333604335785, + "epoch": 1.2, + "learning_rate": 4.887292194984503e-05, + "loss": 0.6116, + "step": 1423, + "task_loss": 0.621565580368042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6438086628913879, + "epoch": 1.2, + "learning_rate": 4.886822579130272e-05, + "loss": 0.6486, + "step": 1424, + "task_loss": 1.165236473083496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8604700565338135, + "epoch": 1.2, + "learning_rate": 4.88635296327604e-05, + "loss": 0.8168, + "step": 1425, + "task_loss": 1.1765363216400146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46633028984069824, + "epoch": 1.21, + "learning_rate": 4.885883347421809e-05, + "loss": 0.4986, + "step": 1426, + "task_loss": 1.0633851289749146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8429061770439148, + "epoch": 1.21, + "learning_rate": 4.885413731567578e-05, + "loss": 0.7725, + "step": 1427, + "task_loss": 1.5832393169403076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8090022802352905, + "epoch": 1.21, + "learning_rate": 4.884944115713347e-05, + "loss": 0.7922, + "step": 1428, + "task_loss": 1.664628267288208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8478543758392334, + "epoch": 1.21, + "learning_rate": 4.8844744998591154e-05, + "loss": 0.7248, + "step": 1429, + "task_loss": 1.1572437286376953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7138170599937439, + "epoch": 1.21, + "learning_rate": 4.884004884004884e-05, + "loss": 0.6401, + "step": 1430, + "task_loss": 0.8682454824447632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5325661897659302, + "epoch": 1.21, + "learning_rate": 4.883535268150653e-05, + "loss": 0.6, + "step": 1431, + "task_loss": 0.8485495448112488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.207361102104187, + "epoch": 1.21, + "learning_rate": 4.883065652296422e-05, + "loss": 0.8543, + "step": 1432, + "task_loss": 1.496865153312683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3847033381462097, + "epoch": 1.21, + "learning_rate": 4.8825960364421906e-05, + "loss": 0.6112, + "step": 1433, + "task_loss": 0.16500955820083618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5243787169456482, + "epoch": 1.21, + "learning_rate": 4.882126420587959e-05, + "loss": 0.6966, + "step": 1434, + "task_loss": 0.7289472818374634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36489444971084595, + "epoch": 1.21, + "learning_rate": 4.881656804733728e-05, + "loss": 0.5224, + "step": 1435, + "task_loss": 0.3598242998123169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.929207980632782, + "epoch": 1.21, + "learning_rate": 4.881187188879497e-05, + "loss": 0.65, + "step": 1436, + "task_loss": 0.89776211977005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7676011323928833, + "epoch": 1.21, + "learning_rate": 4.880717573025266e-05, + "loss": 0.5714, + "step": 1437, + "task_loss": 0.7398805618286133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6778333187103271, + "epoch": 1.22, + "learning_rate": 4.8802479571710344e-05, + "loss": 0.696, + "step": 1438, + "task_loss": 0.5839777588844299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.5627596378326416, + "epoch": 1.22, + "learning_rate": 4.879778341316803e-05, + "loss": 1.0132, + "step": 1439, + "task_loss": 1.677577018737793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7751020193099976, + "epoch": 1.22, + "learning_rate": 4.879308725462572e-05, + "loss": 0.6619, + "step": 1440, + "task_loss": 0.3062921166419983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0167386531829834, + "epoch": 1.22, + "learning_rate": 4.878839109608341e-05, + "loss": 0.8926, + "step": 1441, + "task_loss": 0.772346556186676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6314076781272888, + "epoch": 1.22, + "learning_rate": 4.878369493754109e-05, + "loss": 0.5538, + "step": 1442, + "task_loss": 0.8512011170387268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4654419422149658, + "epoch": 1.22, + "learning_rate": 4.877899877899878e-05, + "loss": 0.6992, + "step": 1443, + "task_loss": 0.6515836715698242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35642409324645996, + "epoch": 1.22, + "learning_rate": 4.877430262045647e-05, + "loss": 0.493, + "step": 1444, + "task_loss": 0.24259091913700104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6541587710380554, + "epoch": 1.22, + "learning_rate": 4.8769606461914155e-05, + "loss": 0.771, + "step": 1445, + "task_loss": 0.4378105401992798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0534685850143433, + "epoch": 1.22, + "learning_rate": 4.876491030337185e-05, + "loss": 0.7387, + "step": 1446, + "task_loss": 1.2129827737808228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2433269023895264, + "epoch": 1.22, + "learning_rate": 4.876021414482953e-05, + "loss": 0.8619, + "step": 1447, + "task_loss": 1.5188193321228027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.757678210735321, + "epoch": 1.22, + "learning_rate": 4.875551798628722e-05, + "loss": 0.5173, + "step": 1448, + "task_loss": 0.6181909441947937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5193491578102112, + "epoch": 1.22, + "learning_rate": 4.875082182774491e-05, + "loss": 0.5844, + "step": 1449, + "task_loss": 0.29249513149261475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8145077228546143, + "epoch": 1.23, + "learning_rate": 4.8746125669202594e-05, + "loss": 0.7726, + "step": 1450, + "task_loss": 1.3930768966674805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.594058632850647, + "epoch": 1.23, + "learning_rate": 4.874142951066028e-05, + "loss": 0.6594, + "step": 1451, + "task_loss": 0.8744868040084839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9866927862167358, + "epoch": 1.23, + "learning_rate": 4.8736733352117967e-05, + "loss": 0.6829, + "step": 1452, + "task_loss": 2.104647159576416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5556595325469971, + "epoch": 1.23, + "learning_rate": 4.873203719357566e-05, + "loss": 0.7088, + "step": 1453, + "task_loss": 0.4150608777999878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4429343342781067, + "epoch": 1.23, + "learning_rate": 4.8727341035033346e-05, + "loss": 0.5612, + "step": 1454, + "task_loss": 0.7755537033081055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49817875027656555, + "epoch": 1.23, + "learning_rate": 4.872264487649103e-05, + "loss": 0.5209, + "step": 1455, + "task_loss": 0.7862489819526672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3651057481765747, + "epoch": 1.23, + "learning_rate": 4.871794871794872e-05, + "loss": 0.8477, + "step": 1456, + "task_loss": 0.3501637279987335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5044481754302979, + "epoch": 1.23, + "learning_rate": 4.8713252559406405e-05, + "loss": 0.7084, + "step": 1457, + "task_loss": 0.41368645429611206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7046396732330322, + "epoch": 1.23, + "learning_rate": 4.87085564008641e-05, + "loss": 0.8082, + "step": 1458, + "task_loss": 1.2316187620162964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.616681694984436, + "epoch": 1.23, + "learning_rate": 4.870386024232178e-05, + "loss": 0.7432, + "step": 1459, + "task_loss": 0.7891397476196289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6617809534072876, + "epoch": 1.23, + "learning_rate": 4.869916408377947e-05, + "loss": 0.7505, + "step": 1460, + "task_loss": 2.1454050540924072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7211824059486389, + "epoch": 1.23, + "learning_rate": 4.869446792523716e-05, + "loss": 0.6755, + "step": 1461, + "task_loss": 1.1156843900680542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6543716192245483, + "epoch": 1.24, + "learning_rate": 4.868977176669485e-05, + "loss": 0.7326, + "step": 1462, + "task_loss": 0.4479277729988098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49115681648254395, + "epoch": 1.24, + "learning_rate": 4.868507560815254e-05, + "loss": 0.6575, + "step": 1463, + "task_loss": 0.5252255797386169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47477197647094727, + "epoch": 1.24, + "learning_rate": 4.8680379449610216e-05, + "loss": 0.623, + "step": 1464, + "task_loss": 1.4158002138137817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5702205896377563, + "epoch": 1.24, + "learning_rate": 4.867568329106791e-05, + "loss": 0.6556, + "step": 1465, + "task_loss": 0.9019696116447449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.468389630317688, + "epoch": 1.24, + "learning_rate": 4.8670987132525596e-05, + "loss": 0.5573, + "step": 1466, + "task_loss": 0.652245044708252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4848339855670929, + "epoch": 1.24, + "learning_rate": 4.866629097398329e-05, + "loss": 0.6065, + "step": 1467, + "task_loss": 0.4609755277633667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6450019478797913, + "epoch": 1.24, + "learning_rate": 4.866159481544097e-05, + "loss": 0.6613, + "step": 1468, + "task_loss": 1.2018921375274658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.3709689378738403, + "epoch": 1.24, + "learning_rate": 4.865689865689866e-05, + "loss": 0.7391, + "step": 1469, + "task_loss": 1.4850081205368042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5193872451782227, + "epoch": 1.24, + "learning_rate": 4.865220249835635e-05, + "loss": 0.5853, + "step": 1470, + "task_loss": 0.6702150106430054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.547351062297821, + "epoch": 1.24, + "learning_rate": 4.8647506339814034e-05, + "loss": 0.5512, + "step": 1471, + "task_loss": 0.9600668549537659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6322142481803894, + "epoch": 1.24, + "learning_rate": 4.864281018127172e-05, + "loss": 0.6738, + "step": 1472, + "task_loss": 0.6918298006057739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7747842073440552, + "epoch": 1.24, + "learning_rate": 4.863811402272941e-05, + "loss": 0.6032, + "step": 1473, + "task_loss": 0.6017622947692871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6447668075561523, + "epoch": 1.25, + "learning_rate": 4.86334178641871e-05, + "loss": 0.574, + "step": 1474, + "task_loss": 0.41675034165382385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4318716526031494, + "epoch": 1.25, + "learning_rate": 4.8628721705644786e-05, + "loss": 0.572, + "step": 1475, + "task_loss": 1.0527219772338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.676533043384552, + "epoch": 1.25, + "learning_rate": 4.862402554710247e-05, + "loss": 0.5477, + "step": 1476, + "task_loss": 1.4153600931167603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6107629537582397, + "epoch": 1.25, + "learning_rate": 4.861932938856016e-05, + "loss": 0.749, + "step": 1477, + "task_loss": 0.7063106298446655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48590511083602905, + "epoch": 1.25, + "learning_rate": 4.8614633230017845e-05, + "loss": 0.5163, + "step": 1478, + "task_loss": 0.8698598742485046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42161017656326294, + "epoch": 1.25, + "learning_rate": 4.860993707147554e-05, + "loss": 0.5406, + "step": 1479, + "task_loss": 0.6526816487312317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36456745862960815, + "epoch": 1.25, + "learning_rate": 4.8605240912933225e-05, + "loss": 0.53, + "step": 1480, + "task_loss": 0.6898282766342163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9475955963134766, + "epoch": 1.25, + "learning_rate": 4.860054475439091e-05, + "loss": 0.616, + "step": 1481, + "task_loss": 1.2943830490112305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5560633540153503, + "epoch": 1.25, + "learning_rate": 4.85958485958486e-05, + "loss": 0.637, + "step": 1482, + "task_loss": 1.953587532043457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5974971652030945, + "epoch": 1.25, + "learning_rate": 4.8591152437306284e-05, + "loss": 0.8936, + "step": 1483, + "task_loss": 1.8227041959762573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6112303733825684, + "epoch": 1.25, + "learning_rate": 4.858645627876398e-05, + "loss": 0.6685, + "step": 1484, + "task_loss": 0.3551182150840759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5870187878608704, + "epoch": 1.26, + "learning_rate": 4.8581760120221656e-05, + "loss": 0.8833, + "step": 1485, + "task_loss": 0.7022114992141724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33385583758354187, + "epoch": 1.26, + "learning_rate": 4.857706396167935e-05, + "loss": 0.5457, + "step": 1486, + "task_loss": 0.8311796188354492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37253236770629883, + "epoch": 1.26, + "learning_rate": 4.8572367803137036e-05, + "loss": 0.4951, + "step": 1487, + "task_loss": 0.46015480160713196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6860008835792542, + "epoch": 1.26, + "learning_rate": 4.856767164459472e-05, + "loss": 0.5868, + "step": 1488, + "task_loss": 0.6789003014564514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44354456663131714, + "epoch": 1.26, + "learning_rate": 4.856297548605241e-05, + "loss": 0.6505, + "step": 1489, + "task_loss": 0.6948358416557312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7355843186378479, + "epoch": 1.26, + "learning_rate": 4.8558279327510095e-05, + "loss": 0.7142, + "step": 1490, + "task_loss": 0.5633959770202637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5250322818756104, + "epoch": 1.26, + "learning_rate": 4.855358316896779e-05, + "loss": 0.631, + "step": 1491, + "task_loss": 0.6718034744262695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5688008666038513, + "epoch": 1.26, + "learning_rate": 4.8548887010425474e-05, + "loss": 0.4647, + "step": 1492, + "task_loss": 1.1614246368408203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8129573464393616, + "epoch": 1.26, + "learning_rate": 4.854419085188317e-05, + "loss": 0.6647, + "step": 1493, + "task_loss": 1.364094614982605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9073379039764404, + "epoch": 1.26, + "learning_rate": 4.853949469334085e-05, + "loss": 0.9391, + "step": 1494, + "task_loss": 0.48791617155075073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.622846245765686, + "epoch": 1.26, + "learning_rate": 4.8534798534798533e-05, + "loss": 0.657, + "step": 1495, + "task_loss": 1.2407217025756836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5800409913063049, + "epoch": 1.26, + "learning_rate": 4.8530102376256227e-05, + "loss": 0.4548, + "step": 1496, + "task_loss": 0.603920042514801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8032512664794922, + "epoch": 1.27, + "learning_rate": 4.852540621771391e-05, + "loss": 0.6589, + "step": 1497, + "task_loss": 0.5093135833740234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6750850677490234, + "epoch": 1.27, + "learning_rate": 4.85207100591716e-05, + "loss": 0.5619, + "step": 1498, + "task_loss": 0.8857743144035339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.606971263885498, + "epoch": 1.27, + "learning_rate": 4.8516013900629286e-05, + "loss": 0.5495, + "step": 1499, + "task_loss": 1.0862387418746948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8809822797775269, + "epoch": 1.27, + "learning_rate": 4.851131774208698e-05, + "loss": 0.816, + "step": 1500, + "task_loss": 2.204951286315918 + }, + { + "epoch": 1.27, + "eval_accuracy": 0.8954059405940594, + "eval_loss": 0.3943859934806824, + "eval_runtime": 228.6362, + "eval_samples_per_second": 110.437, + "eval_steps_per_second": 0.866, + "step": 1500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31930065155029297, + "epoch": 1.27, + "learning_rate": 4.8506621583544665e-05, + "loss": 0.6523, + "step": 1501, + "task_loss": 1.2492903470993042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7212862968444824, + "epoch": 1.27, + "learning_rate": 4.850192542500235e-05, + "loss": 0.6757, + "step": 1502, + "task_loss": 0.819610595703125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2794511318206787, + "epoch": 1.27, + "learning_rate": 4.849722926646004e-05, + "loss": 0.4095, + "step": 1503, + "task_loss": 1.114783763885498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.794372022151947, + "epoch": 1.27, + "learning_rate": 4.8492533107917724e-05, + "loss": 0.7523, + "step": 1504, + "task_loss": 0.6503158807754517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9146854877471924, + "epoch": 1.27, + "learning_rate": 4.848783694937542e-05, + "loss": 0.7378, + "step": 1505, + "task_loss": 0.7169913053512573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5197542905807495, + "epoch": 1.27, + "learning_rate": 4.84831407908331e-05, + "loss": 0.6416, + "step": 1506, + "task_loss": 0.8979859948158264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35962212085723877, + "epoch": 1.27, + "learning_rate": 4.847844463229079e-05, + "loss": 0.5496, + "step": 1507, + "task_loss": 0.6904214024543762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9617994427680969, + "epoch": 1.27, + "learning_rate": 4.8473748473748476e-05, + "loss": 0.7286, + "step": 1508, + "task_loss": 1.0132970809936523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7497919201850891, + "epoch": 1.28, + "learning_rate": 4.846905231520616e-05, + "loss": 0.669, + "step": 1509, + "task_loss": 0.29590731859207153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49269795417785645, + "epoch": 1.28, + "learning_rate": 4.8464356156663856e-05, + "loss": 0.658, + "step": 1510, + "task_loss": 1.5304381847381592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6285421848297119, + "epoch": 1.28, + "learning_rate": 4.8459659998121535e-05, + "loss": 0.715, + "step": 1511, + "task_loss": 0.7294623255729675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7012124061584473, + "epoch": 1.28, + "learning_rate": 4.845496383957923e-05, + "loss": 0.5627, + "step": 1512, + "task_loss": 0.8408117294311523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8051336407661438, + "epoch": 1.28, + "learning_rate": 4.8450267681036915e-05, + "loss": 0.6118, + "step": 1513, + "task_loss": 0.5096275806427002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9508499503135681, + "epoch": 1.28, + "learning_rate": 4.84455715224946e-05, + "loss": 0.7496, + "step": 1514, + "task_loss": 1.653529405593872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3358496427536011, + "epoch": 1.28, + "learning_rate": 4.844087536395229e-05, + "loss": 0.6558, + "step": 1515, + "task_loss": 0.043991342186927795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7455700635910034, + "epoch": 1.28, + "learning_rate": 4.8436179205409974e-05, + "loss": 0.6348, + "step": 1516, + "task_loss": 0.8194367289543152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.515150785446167, + "epoch": 1.28, + "learning_rate": 4.843148304686767e-05, + "loss": 0.6803, + "step": 1517, + "task_loss": 1.407773733139038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4168629050254822, + "epoch": 1.28, + "learning_rate": 4.842678688832535e-05, + "loss": 0.6127, + "step": 1518, + "task_loss": 0.5214310884475708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7923804521560669, + "epoch": 1.28, + "learning_rate": 4.842209072978304e-05, + "loss": 0.8023, + "step": 1519, + "task_loss": 0.2299679070711136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7684105634689331, + "epoch": 1.28, + "learning_rate": 4.8417394571240726e-05, + "loss": 0.7438, + "step": 1520, + "task_loss": 1.7137370109558105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7027855515480042, + "epoch": 1.29, + "learning_rate": 4.841269841269841e-05, + "loss": 0.7348, + "step": 1521, + "task_loss": 1.4995673894882202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0628635883331299, + "epoch": 1.29, + "learning_rate": 4.8408002254156105e-05, + "loss": 0.629, + "step": 1522, + "task_loss": 1.3018572330474854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43159350752830505, + "epoch": 1.29, + "learning_rate": 4.840330609561379e-05, + "loss": 0.5522, + "step": 1523, + "task_loss": 0.08686469495296478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8554230332374573, + "epoch": 1.29, + "learning_rate": 4.839860993707148e-05, + "loss": 0.7022, + "step": 1524, + "task_loss": 0.9596909284591675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.728079080581665, + "epoch": 1.29, + "learning_rate": 4.8393913778529164e-05, + "loss": 0.7114, + "step": 1525, + "task_loss": 1.3223060369491577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3996451199054718, + "epoch": 1.29, + "learning_rate": 4.838921761998686e-05, + "loss": 0.5716, + "step": 1526, + "task_loss": 0.09382757544517517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5382207632064819, + "epoch": 1.29, + "learning_rate": 4.8384521461444544e-05, + "loss": 0.6078, + "step": 1527, + "task_loss": 0.8313444256782532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.609110951423645, + "epoch": 1.29, + "learning_rate": 4.837982530290222e-05, + "loss": 0.6282, + "step": 1528, + "task_loss": 0.4950845539569855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6743806004524231, + "epoch": 1.29, + "learning_rate": 4.8375129144359916e-05, + "loss": 0.8634, + "step": 1529, + "task_loss": 1.0592788457870483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5122683644294739, + "epoch": 1.29, + "learning_rate": 4.83704329858176e-05, + "loss": 0.6045, + "step": 1530, + "task_loss": 0.5953865051269531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5554242134094238, + "epoch": 1.29, + "learning_rate": 4.8365736827275296e-05, + "loss": 0.8185, + "step": 1531, + "task_loss": 0.5615949630737305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4926005005836487, + "epoch": 1.29, + "learning_rate": 4.8361040668732975e-05, + "loss": 0.5122, + "step": 1532, + "task_loss": 0.8189513683319092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5228354930877686, + "epoch": 1.3, + "learning_rate": 4.835634451019067e-05, + "loss": 0.512, + "step": 1533, + "task_loss": 0.8986006379127502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7789825797080994, + "epoch": 1.3, + "learning_rate": 4.8351648351648355e-05, + "loss": 0.6405, + "step": 1534, + "task_loss": 1.1608827114105225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4193393588066101, + "epoch": 1.3, + "learning_rate": 4.834695219310604e-05, + "loss": 0.5979, + "step": 1535, + "task_loss": 0.5649136304855347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0244003534317017, + "epoch": 1.3, + "learning_rate": 4.834225603456373e-05, + "loss": 0.7614, + "step": 1536, + "task_loss": 0.6970483064651489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6973425149917603, + "epoch": 1.3, + "learning_rate": 4.8337559876021414e-05, + "loss": 0.6144, + "step": 1537, + "task_loss": 1.1468579769134521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.953492283821106, + "epoch": 1.3, + "learning_rate": 4.833286371747911e-05, + "loss": 0.5734, + "step": 1538, + "task_loss": 0.5555250644683838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5364861488342285, + "epoch": 1.3, + "learning_rate": 4.832816755893679e-05, + "loss": 0.665, + "step": 1539, + "task_loss": 0.5105794668197632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1393046379089355, + "epoch": 1.3, + "learning_rate": 4.832347140039448e-05, + "loss": 0.7321, + "step": 1540, + "task_loss": 1.580297589302063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6072041988372803, + "epoch": 1.3, + "learning_rate": 4.8318775241852166e-05, + "loss": 0.5387, + "step": 1541, + "task_loss": 0.4725264310836792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7581283450126648, + "epoch": 1.3, + "learning_rate": 4.831407908330985e-05, + "loss": 0.7075, + "step": 1542, + "task_loss": 1.1358823776245117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6209812760353088, + "epoch": 1.3, + "learning_rate": 4.8309382924767545e-05, + "loss": 0.8087, + "step": 1543, + "task_loss": 0.7734580636024475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.075265884399414, + "epoch": 1.3, + "learning_rate": 4.830468676622523e-05, + "loss": 0.6882, + "step": 1544, + "task_loss": 2.2783830165863037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.656969428062439, + "epoch": 1.31, + "learning_rate": 4.829999060768292e-05, + "loss": 0.5547, + "step": 1545, + "task_loss": 0.7023230195045471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44739753007888794, + "epoch": 1.31, + "learning_rate": 4.8295294449140604e-05, + "loss": 0.5315, + "step": 1546, + "task_loss": 0.4201110601425171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5791067481040955, + "epoch": 1.31, + "learning_rate": 4.829059829059829e-05, + "loss": 0.7108, + "step": 1547, + "task_loss": 0.375360906124115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7872514724731445, + "epoch": 1.31, + "learning_rate": 4.8285902132055984e-05, + "loss": 0.5944, + "step": 1548, + "task_loss": 1.5709960460662842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8588241338729858, + "epoch": 1.31, + "learning_rate": 4.8281205973513664e-05, + "loss": 0.6576, + "step": 1549, + "task_loss": 0.7646786570549011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8478990197181702, + "epoch": 1.31, + "learning_rate": 4.827650981497136e-05, + "loss": 0.6539, + "step": 1550, + "task_loss": 0.6614632606506348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4807296097278595, + "epoch": 1.31, + "learning_rate": 4.827181365642904e-05, + "loss": 0.5384, + "step": 1551, + "task_loss": 0.3658294975757599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6647264957427979, + "epoch": 1.31, + "learning_rate": 4.826711749788673e-05, + "loss": 0.8314, + "step": 1552, + "task_loss": 0.6389110088348389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6121876835823059, + "epoch": 1.31, + "learning_rate": 4.826242133934442e-05, + "loss": 0.5632, + "step": 1553, + "task_loss": 1.105015754699707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6053259372711182, + "epoch": 1.31, + "learning_rate": 4.82577251808021e-05, + "loss": 0.6133, + "step": 1554, + "task_loss": 0.7961587309837341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4095574915409088, + "epoch": 1.31, + "learning_rate": 4.8253029022259795e-05, + "loss": 0.5772, + "step": 1555, + "task_loss": 0.6977249383926392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6837360858917236, + "epoch": 1.32, + "learning_rate": 4.824833286371748e-05, + "loss": 0.7049, + "step": 1556, + "task_loss": 1.1659740209579468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4575137794017792, + "epoch": 1.32, + "learning_rate": 4.8243636705175175e-05, + "loss": 0.5257, + "step": 1557, + "task_loss": 0.9253278374671936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4212902784347534, + "epoch": 1.32, + "learning_rate": 4.8238940546632854e-05, + "loss": 0.702, + "step": 1558, + "task_loss": 0.20307455956935883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3127292990684509, + "epoch": 1.32, + "learning_rate": 4.823424438809054e-05, + "loss": 0.5411, + "step": 1559, + "task_loss": 1.1613383293151855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7592552900314331, + "epoch": 1.32, + "learning_rate": 4.8229548229548234e-05, + "loss": 0.7108, + "step": 1560, + "task_loss": 1.1608394384384155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9071587324142456, + "epoch": 1.32, + "learning_rate": 4.822485207100592e-05, + "loss": 0.7645, + "step": 1561, + "task_loss": 0.8745078444480896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3181525766849518, + "epoch": 1.32, + "learning_rate": 4.8220155912463606e-05, + "loss": 0.4542, + "step": 1562, + "task_loss": 0.06940240412950516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6186635494232178, + "epoch": 1.32, + "learning_rate": 4.821545975392129e-05, + "loss": 0.6455, + "step": 1563, + "task_loss": 0.578852653503418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.410347044467926, + "epoch": 1.32, + "learning_rate": 4.8210763595378986e-05, + "loss": 0.6124, + "step": 1564, + "task_loss": 1.0148606300354004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7263593673706055, + "epoch": 1.32, + "learning_rate": 4.820606743683667e-05, + "loss": 0.5382, + "step": 1565, + "task_loss": 0.4010551869869232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9422719478607178, + "epoch": 1.32, + "learning_rate": 4.820137127829435e-05, + "loss": 0.8576, + "step": 1566, + "task_loss": 0.9096721410751343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5140295028686523, + "epoch": 1.32, + "learning_rate": 4.8196675119752045e-05, + "loss": 0.6439, + "step": 1567, + "task_loss": 0.4952224791049957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7024980783462524, + "epoch": 1.33, + "learning_rate": 4.819197896120973e-05, + "loss": 0.5758, + "step": 1568, + "task_loss": 0.7159125208854675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5138143301010132, + "epoch": 1.33, + "learning_rate": 4.8187282802667424e-05, + "loss": 0.6917, + "step": 1569, + "task_loss": 0.6480473279953003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3256946802139282, + "epoch": 1.33, + "learning_rate": 4.818258664412511e-05, + "loss": 0.4723, + "step": 1570, + "task_loss": 0.7325423955917358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5533435344696045, + "epoch": 1.33, + "learning_rate": 4.81778904855828e-05, + "loss": 0.5658, + "step": 1571, + "task_loss": 0.5203610062599182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.592958390712738, + "epoch": 1.33, + "learning_rate": 4.817319432704048e-05, + "loss": 0.453, + "step": 1572, + "task_loss": 0.44023460149765015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4569319486618042, + "epoch": 1.33, + "learning_rate": 4.816849816849817e-05, + "loss": 0.5174, + "step": 1573, + "task_loss": 0.10886258631944656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4337739944458008, + "epoch": 1.33, + "learning_rate": 4.816380200995586e-05, + "loss": 0.4938, + "step": 1574, + "task_loss": 0.5731622576713562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43101778626441956, + "epoch": 1.33, + "learning_rate": 4.815910585141354e-05, + "loss": 0.4946, + "step": 1575, + "task_loss": 1.2174791097640991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3706350326538086, + "epoch": 1.33, + "learning_rate": 4.8154409692871235e-05, + "loss": 0.3908, + "step": 1576, + "task_loss": 0.12853409349918365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3911697268486023, + "epoch": 1.33, + "learning_rate": 4.814971353432892e-05, + "loss": 0.4099, + "step": 1577, + "task_loss": 0.2425040453672409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0136144161224365, + "epoch": 1.33, + "learning_rate": 4.814501737578661e-05, + "loss": 0.7654, + "step": 1578, + "task_loss": 0.28306055068969727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5577731132507324, + "epoch": 1.33, + "learning_rate": 4.8140321217244294e-05, + "loss": 0.7658, + "step": 1579, + "task_loss": 1.4907143115997314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6440142393112183, + "epoch": 1.34, + "learning_rate": 4.813562505870198e-05, + "loss": 0.6122, + "step": 1580, + "task_loss": 0.9424047470092773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6828206181526184, + "epoch": 1.34, + "learning_rate": 4.8130928900159674e-05, + "loss": 0.706, + "step": 1581, + "task_loss": 0.8642055988311768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49305492639541626, + "epoch": 1.34, + "learning_rate": 4.812623274161736e-05, + "loss": 0.6297, + "step": 1582, + "task_loss": 1.0863714218139648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2583116888999939, + "epoch": 1.34, + "learning_rate": 4.8121536583075046e-05, + "loss": 0.4294, + "step": 1583, + "task_loss": 0.6684529185295105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7939484119415283, + "epoch": 1.34, + "learning_rate": 4.811684042453273e-05, + "loss": 0.6723, + "step": 1584, + "task_loss": 1.183083415031433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0646734237670898, + "epoch": 1.34, + "learning_rate": 4.811214426599042e-05, + "loss": 0.7159, + "step": 1585, + "task_loss": 0.5680029392242432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5453145503997803, + "epoch": 1.34, + "learning_rate": 4.810744810744811e-05, + "loss": 0.5912, + "step": 1586, + "task_loss": 0.18548765778541565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6063659191131592, + "epoch": 1.34, + "learning_rate": 4.81027519489058e-05, + "loss": 0.5761, + "step": 1587, + "task_loss": 0.41710150241851807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7054169178009033, + "epoch": 1.34, + "learning_rate": 4.8098055790363485e-05, + "loss": 0.6263, + "step": 1588, + "task_loss": 0.5029739737510681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3755860924720764, + "epoch": 1.34, + "learning_rate": 4.809335963182117e-05, + "loss": 0.5577, + "step": 1589, + "task_loss": 0.47138115763664246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5435585975646973, + "epoch": 1.34, + "learning_rate": 4.808866347327886e-05, + "loss": 0.7374, + "step": 1590, + "task_loss": 0.7633550763130188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7981275916099548, + "epoch": 1.34, + "learning_rate": 4.808396731473655e-05, + "loss": 0.5895, + "step": 1591, + "task_loss": 0.7119269371032715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48303720355033875, + "epoch": 1.35, + "learning_rate": 4.807927115619423e-05, + "loss": 0.4605, + "step": 1592, + "task_loss": 1.0852748155593872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7031118273735046, + "epoch": 1.35, + "learning_rate": 4.8074574997651923e-05, + "loss": 0.5205, + "step": 1593, + "task_loss": 1.1663708686828613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4383600652217865, + "epoch": 1.35, + "learning_rate": 4.806987883910961e-05, + "loss": 0.5212, + "step": 1594, + "task_loss": 0.9021636843681335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.418383926153183, + "epoch": 1.35, + "learning_rate": 4.80651826805673e-05, + "loss": 0.5409, + "step": 1595, + "task_loss": 0.13773605227470398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3269578814506531, + "epoch": 1.35, + "learning_rate": 4.806048652202498e-05, + "loss": 0.6929, + "step": 1596, + "task_loss": 1.1575239896774292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44816088676452637, + "epoch": 1.35, + "learning_rate": 4.8055790363482676e-05, + "loss": 0.5433, + "step": 1597, + "task_loss": 0.7787641286849976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4737761616706848, + "epoch": 1.35, + "learning_rate": 4.805109420494036e-05, + "loss": 0.587, + "step": 1598, + "task_loss": 0.6614913940429688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.970577597618103, + "epoch": 1.35, + "learning_rate": 4.804639804639805e-05, + "loss": 0.6996, + "step": 1599, + "task_loss": 1.2826341390609741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5127413272857666, + "epoch": 1.35, + "learning_rate": 4.804170188785574e-05, + "loss": 0.6209, + "step": 1600, + "task_loss": 0.5460997819900513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36214709281921387, + "epoch": 1.35, + "learning_rate": 4.803700572931342e-05, + "loss": 0.6312, + "step": 1601, + "task_loss": 1.021809697151184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5148459672927856, + "epoch": 1.35, + "learning_rate": 4.8032309570771114e-05, + "loss": 0.611, + "step": 1602, + "task_loss": 0.3887481987476349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6700729727745056, + "epoch": 1.35, + "learning_rate": 4.80276134122288e-05, + "loss": 0.5438, + "step": 1603, + "task_loss": 0.4404171407222748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5128576159477234, + "epoch": 1.36, + "learning_rate": 4.802291725368649e-05, + "loss": 0.6531, + "step": 1604, + "task_loss": 0.3466219902038574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5506155490875244, + "epoch": 1.36, + "learning_rate": 4.801822109514417e-05, + "loss": 0.6177, + "step": 1605, + "task_loss": 0.17074881494045258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.846977949142456, + "epoch": 1.36, + "learning_rate": 4.801352493660186e-05, + "loss": 0.601, + "step": 1606, + "task_loss": 0.7702576518058777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.588234543800354, + "epoch": 1.36, + "learning_rate": 4.800882877805955e-05, + "loss": 0.6555, + "step": 1607, + "task_loss": 0.9942833185195923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6817035675048828, + "epoch": 1.36, + "learning_rate": 4.800413261951724e-05, + "loss": 0.7148, + "step": 1608, + "task_loss": 0.9131969213485718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43112996220588684, + "epoch": 1.36, + "learning_rate": 4.7999436460974925e-05, + "loss": 0.5319, + "step": 1609, + "task_loss": 0.9301946759223938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.846962571144104, + "epoch": 1.36, + "learning_rate": 4.799474030243261e-05, + "loss": 0.7697, + "step": 1610, + "task_loss": 1.4009406566619873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8207221627235413, + "epoch": 1.36, + "learning_rate": 4.79900441438903e-05, + "loss": 0.8219, + "step": 1611, + "task_loss": 0.9331570863723755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6331489086151123, + "epoch": 1.36, + "learning_rate": 4.798534798534799e-05, + "loss": 0.662, + "step": 1612, + "task_loss": 0.7073463201522827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4149402379989624, + "epoch": 1.36, + "learning_rate": 4.798065182680567e-05, + "loss": 0.628, + "step": 1613, + "task_loss": 0.58563232421875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6279155015945435, + "epoch": 1.36, + "learning_rate": 4.7975955668263364e-05, + "loss": 0.6153, + "step": 1614, + "task_loss": 1.2558414936065674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40521812438964844, + "epoch": 1.36, + "learning_rate": 4.797125950972105e-05, + "loss": 0.5038, + "step": 1615, + "task_loss": 0.4520301818847656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4847283661365509, + "epoch": 1.37, + "learning_rate": 4.7966563351178736e-05, + "loss": 0.5033, + "step": 1616, + "task_loss": 0.4998977482318878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5778762102127075, + "epoch": 1.37, + "learning_rate": 4.796186719263643e-05, + "loss": 0.5275, + "step": 1617, + "task_loss": 0.6062279343605042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3704608082771301, + "epoch": 1.37, + "learning_rate": 4.795717103409411e-05, + "loss": 0.5168, + "step": 1618, + "task_loss": 0.7343268990516663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45506542921066284, + "epoch": 1.37, + "learning_rate": 4.79524748755518e-05, + "loss": 0.5677, + "step": 1619, + "task_loss": 0.35434967279434204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37402915954589844, + "epoch": 1.37, + "learning_rate": 4.794777871700949e-05, + "loss": 0.5393, + "step": 1620, + "task_loss": 0.035643644630908966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0044082403182983, + "epoch": 1.37, + "learning_rate": 4.794308255846718e-05, + "loss": 0.707, + "step": 1621, + "task_loss": 0.9272604584693909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4245780110359192, + "epoch": 1.37, + "learning_rate": 4.793838639992486e-05, + "loss": 0.5962, + "step": 1622, + "task_loss": 0.5920388102531433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4326555132865906, + "epoch": 1.37, + "learning_rate": 4.793369024138255e-05, + "loss": 0.4418, + "step": 1623, + "task_loss": 1.226893424987793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6501697897911072, + "epoch": 1.37, + "learning_rate": 4.792899408284024e-05, + "loss": 0.6174, + "step": 1624, + "task_loss": 1.0539603233337402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6324215531349182, + "epoch": 1.37, + "learning_rate": 4.792429792429793e-05, + "loss": 0.6366, + "step": 1625, + "task_loss": 0.9670824408531189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.791627824306488, + "epoch": 1.37, + "learning_rate": 4.791960176575561e-05, + "loss": 0.6822, + "step": 1626, + "task_loss": 0.6989907026290894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6285141706466675, + "epoch": 1.38, + "learning_rate": 4.79149056072133e-05, + "loss": 0.5556, + "step": 1627, + "task_loss": 0.9508346915245056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7085584402084351, + "epoch": 1.38, + "learning_rate": 4.791020944867099e-05, + "loss": 0.5969, + "step": 1628, + "task_loss": 0.7011109590530396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7885511517524719, + "epoch": 1.38, + "learning_rate": 4.790551329012868e-05, + "loss": 0.7065, + "step": 1629, + "task_loss": 0.668955385684967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7590125799179077, + "epoch": 1.38, + "learning_rate": 4.7900817131586365e-05, + "loss": 0.6951, + "step": 1630, + "task_loss": 0.8794214725494385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6715810298919678, + "epoch": 1.38, + "learning_rate": 4.789612097304405e-05, + "loss": 0.5579, + "step": 1631, + "task_loss": 1.2304558753967285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5521953105926514, + "epoch": 1.38, + "learning_rate": 4.789142481450174e-05, + "loss": 0.6949, + "step": 1632, + "task_loss": 1.133043646812439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2727198600769043, + "epoch": 1.38, + "learning_rate": 4.788672865595943e-05, + "loss": 0.4784, + "step": 1633, + "task_loss": 0.20749114453792572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6448957324028015, + "epoch": 1.38, + "learning_rate": 4.788203249741712e-05, + "loss": 0.8084, + "step": 1634, + "task_loss": 1.107852816581726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0230878591537476, + "epoch": 1.38, + "learning_rate": 4.7877336338874804e-05, + "loss": 0.7094, + "step": 1635, + "task_loss": 0.8893623352050781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6867352724075317, + "epoch": 1.38, + "learning_rate": 4.787264018033249e-05, + "loss": 0.6499, + "step": 1636, + "task_loss": 1.5206507444381714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5930062532424927, + "epoch": 1.38, + "learning_rate": 4.7867944021790177e-05, + "loss": 0.6019, + "step": 1637, + "task_loss": 0.4308053255081177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9250055551528931, + "epoch": 1.38, + "learning_rate": 4.786324786324787e-05, + "loss": 0.8052, + "step": 1638, + "task_loss": 1.394662857055664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6298177242279053, + "epoch": 1.39, + "learning_rate": 4.785855170470555e-05, + "loss": 0.6658, + "step": 1639, + "task_loss": 0.5260576605796814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.625712513923645, + "epoch": 1.39, + "learning_rate": 4.785385554616324e-05, + "loss": 0.6869, + "step": 1640, + "task_loss": 1.094031572341919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3023645281791687, + "epoch": 1.39, + "learning_rate": 4.784915938762093e-05, + "loss": 0.5112, + "step": 1641, + "task_loss": 0.6468602418899536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2255578339099884, + "epoch": 1.39, + "learning_rate": 4.7844463229078615e-05, + "loss": 0.5208, + "step": 1642, + "task_loss": 0.02285122126340866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5913540720939636, + "epoch": 1.39, + "learning_rate": 4.78397670705363e-05, + "loss": 0.6996, + "step": 1643, + "task_loss": 0.5406564474105835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9128932952880859, + "epoch": 1.39, + "learning_rate": 4.783507091199399e-05, + "loss": 0.5013, + "step": 1644, + "task_loss": 0.7289896011352539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8327547311782837, + "epoch": 1.39, + "learning_rate": 4.783037475345168e-05, + "loss": 0.6093, + "step": 1645, + "task_loss": 0.783568263053894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3405892550945282, + "epoch": 1.39, + "learning_rate": 4.782567859490937e-05, + "loss": 0.5788, + "step": 1646, + "task_loss": 0.27206966280937195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6549792885780334, + "epoch": 1.39, + "learning_rate": 4.7820982436367054e-05, + "loss": 0.566, + "step": 1647, + "task_loss": 1.1901692152023315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6471213102340698, + "epoch": 1.39, + "learning_rate": 4.781628627782474e-05, + "loss": 0.6159, + "step": 1648, + "task_loss": 0.6256771683692932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6229049563407898, + "epoch": 1.39, + "learning_rate": 4.7811590119282426e-05, + "loss": 0.6869, + "step": 1649, + "task_loss": 1.2939614057540894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4963051974773407, + "epoch": 1.39, + "learning_rate": 4.780689396074012e-05, + "loss": 0.6677, + "step": 1650, + "task_loss": 0.6176434755325317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5941067934036255, + "epoch": 1.4, + "learning_rate": 4.7802197802197806e-05, + "loss": 0.6619, + "step": 1651, + "task_loss": 0.6285609602928162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4486173987388611, + "epoch": 1.4, + "learning_rate": 4.779750164365549e-05, + "loss": 0.5563, + "step": 1652, + "task_loss": 0.2463979572057724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3717479407787323, + "epoch": 1.4, + "learning_rate": 4.779280548511318e-05, + "loss": 0.5078, + "step": 1653, + "task_loss": 0.27591925859451294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.582754373550415, + "epoch": 1.4, + "learning_rate": 4.7788109326570865e-05, + "loss": 0.6038, + "step": 1654, + "task_loss": 1.1268489360809326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3973112106323242, + "epoch": 1.4, + "learning_rate": 4.778341316802856e-05, + "loss": 0.5645, + "step": 1655, + "task_loss": 0.13454020023345947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6616090536117554, + "epoch": 1.4, + "learning_rate": 4.777871700948624e-05, + "loss": 0.6783, + "step": 1656, + "task_loss": 0.3349473476409912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42962509393692017, + "epoch": 1.4, + "learning_rate": 4.777402085094393e-05, + "loss": 0.6321, + "step": 1657, + "task_loss": 1.292642593383789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3535487949848175, + "epoch": 1.4, + "learning_rate": 4.776932469240162e-05, + "loss": 0.5843, + "step": 1658, + "task_loss": 0.11202388256788254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6484713554382324, + "epoch": 1.4, + "learning_rate": 4.776462853385931e-05, + "loss": 0.4678, + "step": 1659, + "task_loss": 1.1821577548980713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44544318318367004, + "epoch": 1.4, + "learning_rate": 4.775993237531699e-05, + "loss": 0.5599, + "step": 1660, + "task_loss": 0.16938871145248413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7618705034255981, + "epoch": 1.4, + "learning_rate": 4.7755236216774676e-05, + "loss": 0.7092, + "step": 1661, + "task_loss": 0.6327065229415894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45954591035842896, + "epoch": 1.4, + "learning_rate": 4.775054005823237e-05, + "loss": 0.4335, + "step": 1662, + "task_loss": 0.337223619222641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6329273581504822, + "epoch": 1.41, + "learning_rate": 4.7745843899690055e-05, + "loss": 0.5844, + "step": 1663, + "task_loss": 0.7710459232330322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49368688464164734, + "epoch": 1.41, + "learning_rate": 4.774114774114775e-05, + "loss": 0.6015, + "step": 1664, + "task_loss": 0.31707435846328735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4891490638256073, + "epoch": 1.41, + "learning_rate": 4.773645158260543e-05, + "loss": 0.5338, + "step": 1665, + "task_loss": 0.6553794145584106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0766609907150269, + "epoch": 1.41, + "learning_rate": 4.773175542406312e-05, + "loss": 0.7095, + "step": 1666, + "task_loss": 1.0282387733459473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5905341506004333, + "epoch": 1.41, + "learning_rate": 4.772705926552081e-05, + "loss": 0.732, + "step": 1667, + "task_loss": 1.3435232639312744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7805784940719604, + "epoch": 1.41, + "learning_rate": 4.7722363106978494e-05, + "loss": 0.5006, + "step": 1668, + "task_loss": 0.588904857635498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2207416146993637, + "epoch": 1.41, + "learning_rate": 4.771766694843618e-05, + "loss": 0.5146, + "step": 1669, + "task_loss": 0.9002301096916199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4912632703781128, + "epoch": 1.41, + "learning_rate": 4.7712970789893866e-05, + "loss": 0.5455, + "step": 1670, + "task_loss": 0.4370778799057007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.341635525226593, + "epoch": 1.41, + "learning_rate": 4.770827463135156e-05, + "loss": 0.5415, + "step": 1671, + "task_loss": 0.6288416385650635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5023935437202454, + "epoch": 1.41, + "learning_rate": 4.7703578472809246e-05, + "loss": 0.6909, + "step": 1672, + "task_loss": 0.48191872239112854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4665360748767853, + "epoch": 1.41, + "learning_rate": 4.769888231426693e-05, + "loss": 0.7911, + "step": 1673, + "task_loss": 0.6535815000534058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5330772995948792, + "epoch": 1.41, + "learning_rate": 4.769418615572462e-05, + "loss": 0.5724, + "step": 1674, + "task_loss": 0.3654189705848694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5425856113433838, + "epoch": 1.42, + "learning_rate": 4.7689489997182305e-05, + "loss": 0.4697, + "step": 1675, + "task_loss": 0.39765316247940063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46999087929725647, + "epoch": 1.42, + "learning_rate": 4.768479383864e-05, + "loss": 0.4343, + "step": 1676, + "task_loss": 0.8304810523986816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9027906656265259, + "epoch": 1.42, + "learning_rate": 4.7680097680097684e-05, + "loss": 0.62, + "step": 1677, + "task_loss": 0.5658296346664429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.71793532371521, + "epoch": 1.42, + "learning_rate": 4.767540152155537e-05, + "loss": 0.7569, + "step": 1678, + "task_loss": 0.675432562828064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5552042722702026, + "epoch": 1.42, + "learning_rate": 4.767070536301306e-05, + "loss": 0.5337, + "step": 1679, + "task_loss": 0.9523319005966187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44727009534835815, + "epoch": 1.42, + "learning_rate": 4.7666009204470743e-05, + "loss": 0.5384, + "step": 1680, + "task_loss": 1.0604907274246216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6641278862953186, + "epoch": 1.42, + "learning_rate": 4.7661313045928437e-05, + "loss": 0.5178, + "step": 1681, + "task_loss": 1.0119487047195435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39726608991622925, + "epoch": 1.42, + "learning_rate": 4.7656616887386116e-05, + "loss": 0.4341, + "step": 1682, + "task_loss": 0.20627453923225403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5769674777984619, + "epoch": 1.42, + "learning_rate": 4.765192072884381e-05, + "loss": 0.4123, + "step": 1683, + "task_loss": 0.10757043957710266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4141567349433899, + "epoch": 1.42, + "learning_rate": 4.7647224570301496e-05, + "loss": 0.5555, + "step": 1684, + "task_loss": 0.9611308574676514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6603943705558777, + "epoch": 1.42, + "learning_rate": 4.764252841175918e-05, + "loss": 0.7677, + "step": 1685, + "task_loss": 0.34366485476493835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6432027816772461, + "epoch": 1.42, + "learning_rate": 4.763783225321687e-05, + "loss": 0.5967, + "step": 1686, + "task_loss": 0.7625919580459595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3677903115749359, + "epoch": 1.43, + "learning_rate": 4.7633136094674555e-05, + "loss": 0.5526, + "step": 1687, + "task_loss": 0.5651863217353821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8282896280288696, + "epoch": 1.43, + "learning_rate": 4.762843993613225e-05, + "loss": 0.733, + "step": 1688, + "task_loss": 0.7544581294059753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3845669627189636, + "epoch": 1.43, + "learning_rate": 4.7623743777589934e-05, + "loss": 0.5428, + "step": 1689, + "task_loss": 0.419605016708374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4004131555557251, + "epoch": 1.43, + "learning_rate": 4.761904761904762e-05, + "loss": 0.6542, + "step": 1690, + "task_loss": 0.28962236642837524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37988877296447754, + "epoch": 1.43, + "learning_rate": 4.761435146050531e-05, + "loss": 0.5331, + "step": 1691, + "task_loss": 1.2778712511062622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9036530256271362, + "epoch": 1.43, + "learning_rate": 4.7609655301963e-05, + "loss": 0.4934, + "step": 1692, + "task_loss": 0.9585182070732117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3795052170753479, + "epoch": 1.43, + "learning_rate": 4.7604959143420686e-05, + "loss": 0.5756, + "step": 1693, + "task_loss": 0.060575101524591446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3176575303077698, + "epoch": 1.43, + "learning_rate": 4.760026298487837e-05, + "loss": 0.5541, + "step": 1694, + "task_loss": 0.04321736469864845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4549373984336853, + "epoch": 1.43, + "learning_rate": 4.759556682633606e-05, + "loss": 0.5903, + "step": 1695, + "task_loss": 0.9013234376907349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5707936882972717, + "epoch": 1.43, + "learning_rate": 4.7590870667793745e-05, + "loss": 0.5708, + "step": 1696, + "task_loss": 1.176888346672058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7325726747512817, + "epoch": 1.43, + "learning_rate": 4.758617450925144e-05, + "loss": 0.7259, + "step": 1697, + "task_loss": 1.5041282176971436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4337136447429657, + "epoch": 1.44, + "learning_rate": 4.7581478350709125e-05, + "loss": 0.6171, + "step": 1698, + "task_loss": 0.9632522463798523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8745871782302856, + "epoch": 1.44, + "learning_rate": 4.757678219216681e-05, + "loss": 0.7305, + "step": 1699, + "task_loss": 2.1209444999694824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42600828409194946, + "epoch": 1.44, + "learning_rate": 4.75720860336245e-05, + "loss": 0.8298, + "step": 1700, + "task_loss": 0.7192578315734863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38154494762420654, + "epoch": 1.44, + "learning_rate": 4.7567389875082184e-05, + "loss": 0.6096, + "step": 1701, + "task_loss": 0.9794694781303406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3135108947753906, + "epoch": 1.44, + "learning_rate": 4.756269371653988e-05, + "loss": 0.4503, + "step": 1702, + "task_loss": 0.6690059900283813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45923948287963867, + "epoch": 1.44, + "learning_rate": 4.7557997557997556e-05, + "loss": 0.5694, + "step": 1703, + "task_loss": 0.9617040157318115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4590805172920227, + "epoch": 1.44, + "learning_rate": 4.755330139945525e-05, + "loss": 0.5601, + "step": 1704, + "task_loss": 0.3061573803424835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4680730104446411, + "epoch": 1.44, + "learning_rate": 4.7548605240912936e-05, + "loss": 0.7121, + "step": 1705, + "task_loss": 0.8845243453979492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3977121114730835, + "epoch": 1.44, + "learning_rate": 4.754390908237062e-05, + "loss": 0.5876, + "step": 1706, + "task_loss": 1.1434568166732788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4672061800956726, + "epoch": 1.44, + "learning_rate": 4.7539212923828315e-05, + "loss": 0.5551, + "step": 1707, + "task_loss": 0.5394863486289978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3127460181713104, + "epoch": 1.44, + "learning_rate": 4.7534516765285995e-05, + "loss": 0.4342, + "step": 1708, + "task_loss": 0.1933390200138092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2080340385437012, + "epoch": 1.44, + "learning_rate": 4.752982060674369e-05, + "loss": 0.6746, + "step": 1709, + "task_loss": 0.8523832559585571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2012395858764648, + "epoch": 1.45, + "learning_rate": 4.7525124448201374e-05, + "loss": 0.7143, + "step": 1710, + "task_loss": 0.8809433579444885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.685310959815979, + "epoch": 1.45, + "learning_rate": 4.752042828965906e-05, + "loss": 0.7435, + "step": 1711, + "task_loss": 1.413419485092163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5993162989616394, + "epoch": 1.45, + "learning_rate": 4.751573213111675e-05, + "loss": 0.5446, + "step": 1712, + "task_loss": 0.9045497179031372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3171706199645996, + "epoch": 1.45, + "learning_rate": 4.751103597257443e-05, + "loss": 0.5578, + "step": 1713, + "task_loss": 0.8205767273902893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5081040859222412, + "epoch": 1.45, + "learning_rate": 4.7506339814032126e-05, + "loss": 0.5731, + "step": 1714, + "task_loss": 1.8179906606674194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5296933054924011, + "epoch": 1.45, + "learning_rate": 4.750164365548981e-05, + "loss": 0.5222, + "step": 1715, + "task_loss": 0.4145263731479645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5474762916564941, + "epoch": 1.45, + "learning_rate": 4.74969474969475e-05, + "loss": 0.601, + "step": 1716, + "task_loss": 0.4050251543521881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5535275340080261, + "epoch": 1.45, + "learning_rate": 4.7492251338405185e-05, + "loss": 0.6063, + "step": 1717, + "task_loss": 0.8784699440002441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5403037667274475, + "epoch": 1.45, + "learning_rate": 4.748755517986287e-05, + "loss": 0.5823, + "step": 1718, + "task_loss": 1.4323643445968628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6731171607971191, + "epoch": 1.45, + "learning_rate": 4.7482859021320565e-05, + "loss": 0.6806, + "step": 1719, + "task_loss": 0.6315076947212219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6189766526222229, + "epoch": 1.45, + "learning_rate": 4.7478162862778244e-05, + "loss": 0.5909, + "step": 1720, + "task_loss": 1.31588613986969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7352336645126343, + "epoch": 1.45, + "learning_rate": 4.747346670423594e-05, + "loss": 0.4898, + "step": 1721, + "task_loss": 1.1063120365142822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5604705214500427, + "epoch": 1.46, + "learning_rate": 4.7468770545693624e-05, + "loss": 0.6487, + "step": 1722, + "task_loss": 0.7263510227203369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.434240460395813, + "epoch": 1.46, + "learning_rate": 4.746407438715132e-05, + "loss": 0.584, + "step": 1723, + "task_loss": 0.5699338912963867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5256175994873047, + "epoch": 1.46, + "learning_rate": 4.7459378228609e-05, + "loss": 0.6261, + "step": 1724, + "task_loss": 0.641033411026001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6866698265075684, + "epoch": 1.46, + "learning_rate": 4.745468207006668e-05, + "loss": 0.7851, + "step": 1725, + "task_loss": 1.4354498386383057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5095258951187134, + "epoch": 1.46, + "learning_rate": 4.7449985911524376e-05, + "loss": 0.5491, + "step": 1726, + "task_loss": 0.7234744429588318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4450724720954895, + "epoch": 1.46, + "learning_rate": 4.744528975298206e-05, + "loss": 0.5529, + "step": 1727, + "task_loss": 0.2879394292831421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.353641152381897, + "epoch": 1.46, + "learning_rate": 4.7440593594439755e-05, + "loss": 0.4256, + "step": 1728, + "task_loss": 0.29519811272621155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.360676646232605, + "epoch": 1.46, + "learning_rate": 4.7435897435897435e-05, + "loss": 0.5496, + "step": 1729, + "task_loss": 0.11224878579378128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8293690085411072, + "epoch": 1.46, + "learning_rate": 4.743120127735513e-05, + "loss": 0.5148, + "step": 1730, + "task_loss": 0.5572472810745239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5536460280418396, + "epoch": 1.46, + "learning_rate": 4.7426505118812814e-05, + "loss": 0.5264, + "step": 1731, + "task_loss": 0.5107840299606323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7914451360702515, + "epoch": 1.46, + "learning_rate": 4.74218089602705e-05, + "loss": 0.6965, + "step": 1732, + "task_loss": 0.5417155623435974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4786778688430786, + "epoch": 1.46, + "learning_rate": 4.741711280172819e-05, + "loss": 0.7089, + "step": 1733, + "task_loss": 0.8589772582054138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5884360074996948, + "epoch": 1.47, + "learning_rate": 4.7412416643185874e-05, + "loss": 0.5197, + "step": 1734, + "task_loss": 1.531866431236267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47132694721221924, + "epoch": 1.47, + "learning_rate": 4.740772048464357e-05, + "loss": 0.4069, + "step": 1735, + "task_loss": 0.3693479001522064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5266053676605225, + "epoch": 1.47, + "learning_rate": 4.740302432610125e-05, + "loss": 0.4947, + "step": 1736, + "task_loss": 0.7525853514671326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9121435880661011, + "epoch": 1.47, + "learning_rate": 4.739832816755894e-05, + "loss": 0.5628, + "step": 1737, + "task_loss": 0.5331129431724548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5926668643951416, + "epoch": 1.47, + "learning_rate": 4.7393632009016626e-05, + "loss": 0.5182, + "step": 1738, + "task_loss": 0.9783690571784973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.422587513923645, + "epoch": 1.47, + "learning_rate": 4.738893585047431e-05, + "loss": 0.5913, + "step": 1739, + "task_loss": 0.43464240431785583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4938250482082367, + "epoch": 1.47, + "learning_rate": 4.7384239691932005e-05, + "loss": 0.6059, + "step": 1740, + "task_loss": 0.38438811898231506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35957518219947815, + "epoch": 1.47, + "learning_rate": 4.737954353338969e-05, + "loss": 0.4843, + "step": 1741, + "task_loss": 1.6396713256835938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5178916454315186, + "epoch": 1.47, + "learning_rate": 4.737484737484738e-05, + "loss": 0.5548, + "step": 1742, + "task_loss": 1.6556224822998047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6585602164268494, + "epoch": 1.47, + "learning_rate": 4.7370151216305064e-05, + "loss": 0.563, + "step": 1743, + "task_loss": 0.8948805332183838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8171379566192627, + "epoch": 1.47, + "learning_rate": 4.736545505776275e-05, + "loss": 0.6025, + "step": 1744, + "task_loss": 0.8966345191001892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5017954111099243, + "epoch": 1.47, + "learning_rate": 4.7360758899220444e-05, + "loss": 0.6999, + "step": 1745, + "task_loss": 0.21828439831733704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5911415815353394, + "epoch": 1.48, + "learning_rate": 4.735606274067812e-05, + "loss": 0.5837, + "step": 1746, + "task_loss": 1.5282588005065918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5401361584663391, + "epoch": 1.48, + "learning_rate": 4.7351366582135816e-05, + "loss": 0.5384, + "step": 1747, + "task_loss": 0.28108352422714233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6684252023696899, + "epoch": 1.48, + "learning_rate": 4.73466704235935e-05, + "loss": 0.672, + "step": 1748, + "task_loss": 1.193454623222351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4978240728378296, + "epoch": 1.48, + "learning_rate": 4.734197426505119e-05, + "loss": 0.5566, + "step": 1749, + "task_loss": 0.9841524958610535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4638376832008362, + "epoch": 1.48, + "learning_rate": 4.7337278106508875e-05, + "loss": 0.5078, + "step": 1750, + "task_loss": 0.8538601994514465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5723704695701599, + "epoch": 1.48, + "learning_rate": 4.733258194796656e-05, + "loss": 0.5516, + "step": 1751, + "task_loss": 1.0360629558563232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32406219840049744, + "epoch": 1.48, + "learning_rate": 4.7327885789424255e-05, + "loss": 0.5789, + "step": 1752, + "task_loss": 0.2305552214384079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3213307857513428, + "epoch": 1.48, + "learning_rate": 4.732318963088194e-05, + "loss": 0.4538, + "step": 1753, + "task_loss": 0.4560941457748413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5032540559768677, + "epoch": 1.48, + "learning_rate": 4.7318493472339634e-05, + "loss": 0.5827, + "step": 1754, + "task_loss": 0.6187043786048889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5370607972145081, + "epoch": 1.48, + "learning_rate": 4.7313797313797314e-05, + "loss": 0.6107, + "step": 1755, + "task_loss": 0.6384979486465454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5159317851066589, + "epoch": 1.48, + "learning_rate": 4.7309101155255e-05, + "loss": 0.4999, + "step": 1756, + "task_loss": 0.587050199508667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7169076800346375, + "epoch": 1.48, + "learning_rate": 4.730440499671269e-05, + "loss": 0.7053, + "step": 1757, + "task_loss": 1.3134206533432007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7701461315155029, + "epoch": 1.49, + "learning_rate": 4.729970883817038e-05, + "loss": 0.7, + "step": 1758, + "task_loss": 1.1118028163909912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4240128993988037, + "epoch": 1.49, + "learning_rate": 4.7295012679628066e-05, + "loss": 0.608, + "step": 1759, + "task_loss": 0.4279307425022125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48941919207572937, + "epoch": 1.49, + "learning_rate": 4.729031652108575e-05, + "loss": 0.7093, + "step": 1760, + "task_loss": 2.10709810256958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6665080785751343, + "epoch": 1.49, + "learning_rate": 4.7285620362543445e-05, + "loss": 0.6386, + "step": 1761, + "task_loss": 1.0828911066055298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7394925355911255, + "epoch": 1.49, + "learning_rate": 4.728092420400113e-05, + "loss": 0.7404, + "step": 1762, + "task_loss": 0.8737367987632751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4843217730522156, + "epoch": 1.49, + "learning_rate": 4.727622804545882e-05, + "loss": 0.5516, + "step": 1763, + "task_loss": 0.2954407036304474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3920549154281616, + "epoch": 1.49, + "learning_rate": 4.7271531886916504e-05, + "loss": 0.5014, + "step": 1764, + "task_loss": 0.11834452301263809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2948736548423767, + "epoch": 1.49, + "learning_rate": 4.726683572837419e-05, + "loss": 0.5344, + "step": 1765, + "task_loss": 0.13339585065841675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43370670080184937, + "epoch": 1.49, + "learning_rate": 4.7262139569831884e-05, + "loss": 0.5657, + "step": 1766, + "task_loss": 0.42753562331199646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9254080057144165, + "epoch": 1.49, + "learning_rate": 4.725744341128956e-05, + "loss": 0.6812, + "step": 1767, + "task_loss": 0.9892339706420898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39960789680480957, + "epoch": 1.49, + "learning_rate": 4.7252747252747257e-05, + "loss": 0.5428, + "step": 1768, + "task_loss": 0.6254394054412842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0269381999969482, + "epoch": 1.5, + "learning_rate": 4.724805109420494e-05, + "loss": 0.618, + "step": 1769, + "task_loss": 0.8685317635536194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4588848650455475, + "epoch": 1.5, + "learning_rate": 4.724335493566263e-05, + "loss": 0.6104, + "step": 1770, + "task_loss": 1.3435319662094116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.446067750453949, + "epoch": 1.5, + "learning_rate": 4.723865877712032e-05, + "loss": 0.542, + "step": 1771, + "task_loss": 0.4482569098472595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47300654649734497, + "epoch": 1.5, + "learning_rate": 4.7233962618578e-05, + "loss": 0.6161, + "step": 1772, + "task_loss": 1.1608705520629883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7786028385162354, + "epoch": 1.5, + "learning_rate": 4.7229266460035695e-05, + "loss": 0.6448, + "step": 1773, + "task_loss": 0.9818819761276245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8129267692565918, + "epoch": 1.5, + "learning_rate": 4.722457030149338e-05, + "loss": 0.5489, + "step": 1774, + "task_loss": 1.4871063232421875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5099161863327026, + "epoch": 1.5, + "learning_rate": 4.721987414295107e-05, + "loss": 0.4436, + "step": 1775, + "task_loss": 0.28026899695396423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8355551362037659, + "epoch": 1.5, + "learning_rate": 4.7215177984408754e-05, + "loss": 0.6905, + "step": 1776, + "task_loss": 1.0886460542678833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5663450956344604, + "epoch": 1.5, + "learning_rate": 4.721048182586644e-05, + "loss": 0.5125, + "step": 1777, + "task_loss": 0.7053067684173584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6802136898040771, + "epoch": 1.5, + "learning_rate": 4.7205785667324133e-05, + "loss": 0.4994, + "step": 1778, + "task_loss": 0.5682879686355591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4849269688129425, + "epoch": 1.5, + "learning_rate": 4.720108950878182e-05, + "loss": 0.4508, + "step": 1779, + "task_loss": 0.18343780934810638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30709266662597656, + "epoch": 1.5, + "learning_rate": 4.7196393350239506e-05, + "loss": 0.5454, + "step": 1780, + "task_loss": 1.104981780052185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8152207732200623, + "epoch": 1.51, + "learning_rate": 4.719169719169719e-05, + "loss": 0.7672, + "step": 1781, + "task_loss": 0.9177175760269165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4609120488166809, + "epoch": 1.51, + "learning_rate": 4.718700103315488e-05, + "loss": 0.5844, + "step": 1782, + "task_loss": 0.49280959367752075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4774126410484314, + "epoch": 1.51, + "learning_rate": 4.718230487461257e-05, + "loss": 0.5882, + "step": 1783, + "task_loss": 0.9920885562896729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4701034426689148, + "epoch": 1.51, + "learning_rate": 4.717760871607026e-05, + "loss": 0.5893, + "step": 1784, + "task_loss": 1.4934524297714233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6782662868499756, + "epoch": 1.51, + "learning_rate": 4.7172912557527945e-05, + "loss": 0.7713, + "step": 1785, + "task_loss": 0.595090925693512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6420391798019409, + "epoch": 1.51, + "learning_rate": 4.716821639898563e-05, + "loss": 0.5801, + "step": 1786, + "task_loss": 0.9405713677406311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45343804359436035, + "epoch": 1.51, + "learning_rate": 4.7163520240443324e-05, + "loss": 0.58, + "step": 1787, + "task_loss": 0.3453715443611145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8332772254943848, + "epoch": 1.51, + "learning_rate": 4.715882408190101e-05, + "loss": 0.541, + "step": 1788, + "task_loss": 1.5292502641677856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.555126965045929, + "epoch": 1.51, + "learning_rate": 4.715412792335869e-05, + "loss": 0.649, + "step": 1789, + "task_loss": 1.7272356748580933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48382294178009033, + "epoch": 1.51, + "learning_rate": 4.714943176481638e-05, + "loss": 0.4535, + "step": 1790, + "task_loss": 0.5396559834480286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48102712631225586, + "epoch": 1.51, + "learning_rate": 4.714473560627407e-05, + "loss": 0.5827, + "step": 1791, + "task_loss": 0.22900213301181793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.739804208278656, + "epoch": 1.51, + "learning_rate": 4.714003944773176e-05, + "loss": 0.7315, + "step": 1792, + "task_loss": 1.0642415285110474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5727558135986328, + "epoch": 1.52, + "learning_rate": 4.713534328918944e-05, + "loss": 0.6187, + "step": 1793, + "task_loss": 0.14176565408706665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31527066230773926, + "epoch": 1.52, + "learning_rate": 4.7130647130647135e-05, + "loss": 0.3815, + "step": 1794, + "task_loss": 0.413805216550827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8277374505996704, + "epoch": 1.52, + "learning_rate": 4.712595097210482e-05, + "loss": 0.7509, + "step": 1795, + "task_loss": 0.792826771736145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33134451508522034, + "epoch": 1.52, + "learning_rate": 4.712125481356251e-05, + "loss": 0.4612, + "step": 1796, + "task_loss": 0.24399790167808533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3478565216064453, + "epoch": 1.52, + "learning_rate": 4.7116558655020194e-05, + "loss": 0.6013, + "step": 1797, + "task_loss": 0.6398569941520691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5370184183120728, + "epoch": 1.52, + "learning_rate": 4.711186249647788e-05, + "loss": 0.6777, + "step": 1798, + "task_loss": 1.4945076704025269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6733025312423706, + "epoch": 1.52, + "learning_rate": 4.7107166337935574e-05, + "loss": 0.6265, + "step": 1799, + "task_loss": 1.7343300580978394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6706859469413757, + "epoch": 1.52, + "learning_rate": 4.710247017939326e-05, + "loss": 0.4879, + "step": 1800, + "task_loss": 0.7121043801307678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6839126944541931, + "epoch": 1.52, + "learning_rate": 4.7097774020850946e-05, + "loss": 0.5784, + "step": 1801, + "task_loss": 0.39130473136901855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7422491312026978, + "epoch": 1.52, + "learning_rate": 4.709307786230863e-05, + "loss": 0.5647, + "step": 1802, + "task_loss": 0.44851943850517273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5224934816360474, + "epoch": 1.52, + "learning_rate": 4.708838170376632e-05, + "loss": 0.6259, + "step": 1803, + "task_loss": 1.4766483306884766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.521412193775177, + "epoch": 1.52, + "learning_rate": 4.708368554522401e-05, + "loss": 0.5853, + "step": 1804, + "task_loss": 0.2764165699481964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.816644549369812, + "epoch": 1.53, + "learning_rate": 4.70789893866817e-05, + "loss": 0.6368, + "step": 1805, + "task_loss": 1.8837902545928955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5346130728721619, + "epoch": 1.53, + "learning_rate": 4.7074293228139385e-05, + "loss": 0.5366, + "step": 1806, + "task_loss": 1.0963741540908813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3711344599723816, + "epoch": 1.53, + "learning_rate": 4.706959706959707e-05, + "loss": 0.3806, + "step": 1807, + "task_loss": 0.7327045202255249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7563214302062988, + "epoch": 1.53, + "learning_rate": 4.706490091105476e-05, + "loss": 0.5583, + "step": 1808, + "task_loss": 0.6566154956817627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5014467239379883, + "epoch": 1.53, + "learning_rate": 4.706020475251245e-05, + "loss": 0.529, + "step": 1809, + "task_loss": 1.7174571752548218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5377577543258667, + "epoch": 1.53, + "learning_rate": 4.705550859397013e-05, + "loss": 0.5551, + "step": 1810, + "task_loss": 0.3616334795951843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5012502074241638, + "epoch": 1.53, + "learning_rate": 4.705081243542782e-05, + "loss": 0.6894, + "step": 1811, + "task_loss": 0.5955134034156799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7653435468673706, + "epoch": 1.53, + "learning_rate": 4.704611627688551e-05, + "loss": 0.6885, + "step": 1812, + "task_loss": 1.3360198736190796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5278595089912415, + "epoch": 1.53, + "learning_rate": 4.7041420118343196e-05, + "loss": 0.6421, + "step": 1813, + "task_loss": 0.40481236577033997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5398977994918823, + "epoch": 1.53, + "learning_rate": 4.703672395980088e-05, + "loss": 0.7748, + "step": 1814, + "task_loss": 0.6062988638877869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5858588218688965, + "epoch": 1.53, + "learning_rate": 4.703202780125857e-05, + "loss": 0.5981, + "step": 1815, + "task_loss": 1.0421767234802246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4117792248725891, + "epoch": 1.53, + "learning_rate": 4.702733164271626e-05, + "loss": 0.5286, + "step": 1816, + "task_loss": 1.0585367679595947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5052696466445923, + "epoch": 1.54, + "learning_rate": 4.702263548417395e-05, + "loss": 0.6296, + "step": 1817, + "task_loss": 0.6016008853912354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6066255569458008, + "epoch": 1.54, + "learning_rate": 4.701793932563164e-05, + "loss": 0.5051, + "step": 1818, + "task_loss": 0.48053833842277527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4595336616039276, + "epoch": 1.54, + "learning_rate": 4.701324316708932e-05, + "loss": 0.5761, + "step": 1819, + "task_loss": 0.9621044397354126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5598829388618469, + "epoch": 1.54, + "learning_rate": 4.700854700854701e-05, + "loss": 0.666, + "step": 1820, + "task_loss": 0.7623969912528992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7515313625335693, + "epoch": 1.54, + "learning_rate": 4.70038508500047e-05, + "loss": 0.5915, + "step": 1821, + "task_loss": 1.030352234840393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6997864246368408, + "epoch": 1.54, + "learning_rate": 4.6999154691462387e-05, + "loss": 0.644, + "step": 1822, + "task_loss": 0.4258491098880768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4800461530685425, + "epoch": 1.54, + "learning_rate": 4.699445853292007e-05, + "loss": 0.6049, + "step": 1823, + "task_loss": 0.7813599109649658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3815319538116455, + "epoch": 1.54, + "learning_rate": 4.698976237437776e-05, + "loss": 0.6337, + "step": 1824, + "task_loss": 0.5089420080184937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5270010232925415, + "epoch": 1.54, + "learning_rate": 4.698506621583545e-05, + "loss": 0.4836, + "step": 1825, + "task_loss": 0.5693044662475586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5194756388664246, + "epoch": 1.54, + "learning_rate": 4.698037005729314e-05, + "loss": 0.48, + "step": 1826, + "task_loss": 1.119638442993164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24129916727542877, + "epoch": 1.54, + "learning_rate": 4.697567389875082e-05, + "loss": 0.3878, + "step": 1827, + "task_loss": 0.11435811221599579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4901048243045807, + "epoch": 1.54, + "learning_rate": 4.697097774020851e-05, + "loss": 0.5264, + "step": 1828, + "task_loss": 1.39665949344635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2881966829299927, + "epoch": 1.55, + "learning_rate": 4.69662815816662e-05, + "loss": 0.4473, + "step": 1829, + "task_loss": 0.7257423400878906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5609315633773804, + "epoch": 1.55, + "learning_rate": 4.696158542312389e-05, + "loss": 0.5217, + "step": 1830, + "task_loss": 1.0994693040847778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38073375821113586, + "epoch": 1.55, + "learning_rate": 4.695688926458158e-05, + "loss": 0.6504, + "step": 1831, + "task_loss": 0.5290494561195374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5119773149490356, + "epoch": 1.55, + "learning_rate": 4.6952193106039264e-05, + "loss": 0.6186, + "step": 1832, + "task_loss": 1.0422250032424927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38902145624160767, + "epoch": 1.55, + "learning_rate": 4.694749694749695e-05, + "loss": 0.4482, + "step": 1833, + "task_loss": 0.7616206407546997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3862183690071106, + "epoch": 1.55, + "learning_rate": 4.6942800788954636e-05, + "loss": 0.4838, + "step": 1834, + "task_loss": 1.4551347494125366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7636275291442871, + "epoch": 1.55, + "learning_rate": 4.693810463041233e-05, + "loss": 0.5794, + "step": 1835, + "task_loss": 0.5440858602523804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.1165210008621216, + "epoch": 1.55, + "learning_rate": 4.693340847187001e-05, + "loss": 0.6582, + "step": 1836, + "task_loss": 1.4407297372817993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.519279956817627, + "epoch": 1.55, + "learning_rate": 4.69287123133277e-05, + "loss": 0.6449, + "step": 1837, + "task_loss": 0.5835684537887573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4269437789916992, + "epoch": 1.55, + "learning_rate": 4.692401615478539e-05, + "loss": 0.4845, + "step": 1838, + "task_loss": 0.7465857267379761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4857889711856842, + "epoch": 1.55, + "learning_rate": 4.6919319996243075e-05, + "loss": 0.6668, + "step": 1839, + "task_loss": 0.5877649784088135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3874586522579193, + "epoch": 1.56, + "learning_rate": 4.691462383770076e-05, + "loss": 0.5063, + "step": 1840, + "task_loss": 0.9883366227149963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8268128633499146, + "epoch": 1.56, + "learning_rate": 4.690992767915845e-05, + "loss": 0.605, + "step": 1841, + "task_loss": 0.9920138716697693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38981783390045166, + "epoch": 1.56, + "learning_rate": 4.690523152061614e-05, + "loss": 0.6329, + "step": 1842, + "task_loss": 1.0755709409713745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5933724641799927, + "epoch": 1.56, + "learning_rate": 4.690053536207383e-05, + "loss": 0.6515, + "step": 1843, + "task_loss": 0.40639886260032654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6007105112075806, + "epoch": 1.56, + "learning_rate": 4.689583920353151e-05, + "loss": 0.5033, + "step": 1844, + "task_loss": 0.22573482990264893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5393925905227661, + "epoch": 1.56, + "learning_rate": 4.68911430449892e-05, + "loss": 0.5745, + "step": 1845, + "task_loss": 0.6461055874824524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43580031394958496, + "epoch": 1.56, + "learning_rate": 4.6886446886446886e-05, + "loss": 0.5406, + "step": 1846, + "task_loss": 0.7986985445022583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5687268972396851, + "epoch": 1.56, + "learning_rate": 4.688175072790458e-05, + "loss": 0.4739, + "step": 1847, + "task_loss": 0.9790605902671814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8112084269523621, + "epoch": 1.56, + "learning_rate": 4.6877054569362265e-05, + "loss": 0.6229, + "step": 1848, + "task_loss": 0.8389408588409424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4346604347229004, + "epoch": 1.56, + "learning_rate": 4.687235841081995e-05, + "loss": 0.4942, + "step": 1849, + "task_loss": 1.1833844184875488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.546808123588562, + "epoch": 1.56, + "learning_rate": 4.686766225227764e-05, + "loss": 0.5042, + "step": 1850, + "task_loss": 0.5895398855209351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.601993978023529, + "epoch": 1.56, + "learning_rate": 4.6862966093735324e-05, + "loss": 0.5649, + "step": 1851, + "task_loss": 0.5142083764076233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49122998118400574, + "epoch": 1.57, + "learning_rate": 4.685826993519302e-05, + "loss": 0.4124, + "step": 1852, + "task_loss": 0.7187750339508057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6636379957199097, + "epoch": 1.57, + "learning_rate": 4.68535737766507e-05, + "loss": 0.6097, + "step": 1853, + "task_loss": 1.213657259941101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5201648473739624, + "epoch": 1.57, + "learning_rate": 4.684887761810839e-05, + "loss": 0.6343, + "step": 1854, + "task_loss": 0.6964669823646545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.525881826877594, + "epoch": 1.57, + "learning_rate": 4.6844181459566076e-05, + "loss": 0.614, + "step": 1855, + "task_loss": 0.6494321227073669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3271472454071045, + "epoch": 1.57, + "learning_rate": 4.683948530102377e-05, + "loss": 0.6381, + "step": 1856, + "task_loss": 0.08460842072963715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.474894255399704, + "epoch": 1.57, + "learning_rate": 4.683478914248145e-05, + "loss": 0.5441, + "step": 1857, + "task_loss": 0.33298078179359436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4968753457069397, + "epoch": 1.57, + "learning_rate": 4.683009298393914e-05, + "loss": 0.4592, + "step": 1858, + "task_loss": 0.5779150724411011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.0260851383209229, + "epoch": 1.57, + "learning_rate": 4.682539682539683e-05, + "loss": 0.7933, + "step": 1859, + "task_loss": 1.6627287864685059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7906615138053894, + "epoch": 1.57, + "learning_rate": 4.6820700666854515e-05, + "loss": 0.7098, + "step": 1860, + "task_loss": 2.013934373855591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3578082323074341, + "epoch": 1.57, + "learning_rate": 4.681600450831221e-05, + "loss": 0.5458, + "step": 1861, + "task_loss": 0.18244178593158722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6916027069091797, + "epoch": 1.57, + "learning_rate": 4.681130834976989e-05, + "loss": 0.6101, + "step": 1862, + "task_loss": 0.7993102073669434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4502459764480591, + "epoch": 1.57, + "learning_rate": 4.680661219122758e-05, + "loss": 0.592, + "step": 1863, + "task_loss": 0.2994005084037781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5625342726707458, + "epoch": 1.58, + "learning_rate": 4.680191603268527e-05, + "loss": 0.6211, + "step": 1864, + "task_loss": 1.1395446062088013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6579430103302002, + "epoch": 1.58, + "learning_rate": 4.6797219874142953e-05, + "loss": 0.8096, + "step": 1865, + "task_loss": 1.5599775314331055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3917092978954315, + "epoch": 1.58, + "learning_rate": 4.679252371560064e-05, + "loss": 0.5361, + "step": 1866, + "task_loss": 0.30034443736076355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5206760168075562, + "epoch": 1.58, + "learning_rate": 4.6787827557058326e-05, + "loss": 0.7454, + "step": 1867, + "task_loss": 1.0147730112075806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5398136377334595, + "epoch": 1.58, + "learning_rate": 4.678313139851602e-05, + "loss": 0.667, + "step": 1868, + "task_loss": 0.4181918799877167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42976731061935425, + "epoch": 1.58, + "learning_rate": 4.6778435239973706e-05, + "loss": 0.693, + "step": 1869, + "task_loss": 0.8976526260375977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45502638816833496, + "epoch": 1.58, + "learning_rate": 4.677373908143139e-05, + "loss": 0.4734, + "step": 1870, + "task_loss": 0.34420374035835266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6993266344070435, + "epoch": 1.58, + "learning_rate": 4.676904292288908e-05, + "loss": 0.6215, + "step": 1871, + "task_loss": 1.2022216320037842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5353761911392212, + "epoch": 1.58, + "learning_rate": 4.6764346764346765e-05, + "loss": 0.5871, + "step": 1872, + "task_loss": 0.3371061086654663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45115602016448975, + "epoch": 1.58, + "learning_rate": 4.675965060580446e-05, + "loss": 0.4945, + "step": 1873, + "task_loss": 0.5973798632621765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.439235657453537, + "epoch": 1.58, + "learning_rate": 4.675495444726214e-05, + "loss": 0.6803, + "step": 1874, + "task_loss": 0.7925198078155518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6705981492996216, + "epoch": 1.58, + "learning_rate": 4.675025828871983e-05, + "loss": 0.5552, + "step": 1875, + "task_loss": 0.7377611398696899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3314593434333801, + "epoch": 1.59, + "learning_rate": 4.674556213017752e-05, + "loss": 0.5198, + "step": 1876, + "task_loss": 0.3796609342098236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25870436429977417, + "epoch": 1.59, + "learning_rate": 4.67408659716352e-05, + "loss": 0.4509, + "step": 1877, + "task_loss": 0.415881872177124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3855830430984497, + "epoch": 1.59, + "learning_rate": 4.6736169813092896e-05, + "loss": 0.7109, + "step": 1878, + "task_loss": 0.44345542788505554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9690287113189697, + "epoch": 1.59, + "learning_rate": 4.6731473654550576e-05, + "loss": 0.6253, + "step": 1879, + "task_loss": 0.693649172782898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9196126461029053, + "epoch": 1.59, + "learning_rate": 4.672677749600827e-05, + "loss": 0.7093, + "step": 1880, + "task_loss": 1.3152531385421753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7723612785339355, + "epoch": 1.59, + "learning_rate": 4.6722081337465955e-05, + "loss": 0.5864, + "step": 1881, + "task_loss": 0.8921085000038147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4764910042285919, + "epoch": 1.59, + "learning_rate": 4.671738517892365e-05, + "loss": 0.4743, + "step": 1882, + "task_loss": 0.16368448734283447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5067263841629028, + "epoch": 1.59, + "learning_rate": 4.671268902038133e-05, + "loss": 0.5572, + "step": 1883, + "task_loss": 0.7184029221534729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3935500681400299, + "epoch": 1.59, + "learning_rate": 4.6707992861839014e-05, + "loss": 0.5763, + "step": 1884, + "task_loss": 0.6569693088531494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4811840355396271, + "epoch": 1.59, + "learning_rate": 4.670329670329671e-05, + "loss": 0.6015, + "step": 1885, + "task_loss": 0.6897271871566772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4812799394130707, + "epoch": 1.59, + "learning_rate": 4.6698600544754394e-05, + "loss": 0.6574, + "step": 1886, + "task_loss": 0.6717453598976135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5927145481109619, + "epoch": 1.59, + "learning_rate": 4.669390438621208e-05, + "loss": 0.5827, + "step": 1887, + "task_loss": 0.7006586790084839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6654424667358398, + "epoch": 1.6, + "learning_rate": 4.6689208227669766e-05, + "loss": 0.6126, + "step": 1888, + "task_loss": 1.265397071838379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7272031307220459, + "epoch": 1.6, + "learning_rate": 4.668451206912746e-05, + "loss": 0.5902, + "step": 1889, + "task_loss": 0.7466819286346436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47081542015075684, + "epoch": 1.6, + "learning_rate": 4.6679815910585146e-05, + "loss": 0.7193, + "step": 1890, + "task_loss": 1.7209584712982178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7862969636917114, + "epoch": 1.6, + "learning_rate": 4.6675119752042825e-05, + "loss": 0.5854, + "step": 1891, + "task_loss": 0.9786199331283569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5097280144691467, + "epoch": 1.6, + "learning_rate": 4.667042359350052e-05, + "loss": 0.4389, + "step": 1892, + "task_loss": 0.30379530787467957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6733946800231934, + "epoch": 1.6, + "learning_rate": 4.6665727434958205e-05, + "loss": 0.6607, + "step": 1893, + "task_loss": 1.7081656455993652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27515310049057007, + "epoch": 1.6, + "learning_rate": 4.66610312764159e-05, + "loss": 0.5711, + "step": 1894, + "task_loss": 0.34513160586357117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5071406364440918, + "epoch": 1.6, + "learning_rate": 4.6656335117873584e-05, + "loss": 0.4801, + "step": 1895, + "task_loss": 1.2544324398040771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37362948060035706, + "epoch": 1.6, + "learning_rate": 4.665163895933127e-05, + "loss": 0.5747, + "step": 1896, + "task_loss": 0.9745263457298279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8012465834617615, + "epoch": 1.6, + "learning_rate": 4.664694280078896e-05, + "loss": 0.6202, + "step": 1897, + "task_loss": 1.1787583827972412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2249249815940857, + "epoch": 1.6, + "learning_rate": 4.664224664224664e-05, + "loss": 0.4416, + "step": 1898, + "task_loss": 0.43031370639801025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4262726902961731, + "epoch": 1.6, + "learning_rate": 4.6637550483704336e-05, + "loss": 0.4927, + "step": 1899, + "task_loss": 0.5710815787315369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47875678539276123, + "epoch": 1.61, + "learning_rate": 4.6632854325162016e-05, + "loss": 0.4549, + "step": 1900, + "task_loss": 0.4836062788963318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5776896476745605, + "epoch": 1.61, + "learning_rate": 4.662815816661971e-05, + "loss": 0.5184, + "step": 1901, + "task_loss": 1.021346926689148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2448888123035431, + "epoch": 1.61, + "learning_rate": 4.6623462008077395e-05, + "loss": 0.5209, + "step": 1902, + "task_loss": 0.2282993495464325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5127319097518921, + "epoch": 1.61, + "learning_rate": 4.661876584953508e-05, + "loss": 0.7279, + "step": 1903, + "task_loss": 0.6702991127967834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4277348518371582, + "epoch": 1.61, + "learning_rate": 4.661406969099277e-05, + "loss": 0.4922, + "step": 1904, + "task_loss": 0.6962489485740662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35797202587127686, + "epoch": 1.61, + "learning_rate": 4.6609373532450454e-05, + "loss": 0.5921, + "step": 1905, + "task_loss": 0.7855871915817261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.59946209192276, + "epoch": 1.61, + "learning_rate": 4.660467737390815e-05, + "loss": 0.54, + "step": 1906, + "task_loss": 1.3011976480484009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9130920171737671, + "epoch": 1.61, + "learning_rate": 4.6599981215365834e-05, + "loss": 0.7217, + "step": 1907, + "task_loss": 0.9444011449813843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6050752401351929, + "epoch": 1.61, + "learning_rate": 4.659528505682352e-05, + "loss": 0.5771, + "step": 1908, + "task_loss": 0.7212833166122437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5063422918319702, + "epoch": 1.61, + "learning_rate": 4.6590588898281207e-05, + "loss": 0.8909, + "step": 1909, + "task_loss": 0.2792868912220001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38653329014778137, + "epoch": 1.61, + "learning_rate": 4.658589273973889e-05, + "loss": 0.4395, + "step": 1910, + "task_loss": 0.9186563491821289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5764357447624207, + "epoch": 1.61, + "learning_rate": 4.6581196581196586e-05, + "loss": 0.4691, + "step": 1911, + "task_loss": 1.0853201150894165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6206574440002441, + "epoch": 1.62, + "learning_rate": 4.657650042265427e-05, + "loss": 0.5765, + "step": 1912, + "task_loss": 1.0348291397094727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44312185049057007, + "epoch": 1.62, + "learning_rate": 4.657180426411196e-05, + "loss": 0.593, + "step": 1913, + "task_loss": 0.6225569844245911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6333627700805664, + "epoch": 1.62, + "learning_rate": 4.6567108105569645e-05, + "loss": 0.6026, + "step": 1914, + "task_loss": 0.640951931476593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5990137457847595, + "epoch": 1.62, + "learning_rate": 4.656241194702733e-05, + "loss": 0.5053, + "step": 1915, + "task_loss": 0.6408780813217163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4648085832595825, + "epoch": 1.62, + "learning_rate": 4.6557715788485025e-05, + "loss": 0.5069, + "step": 1916, + "task_loss": 0.8815034627914429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4059109091758728, + "epoch": 1.62, + "learning_rate": 4.6553019629942704e-05, + "loss": 0.5666, + "step": 1917, + "task_loss": 0.3956148326396942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4059675931930542, + "epoch": 1.62, + "learning_rate": 4.65483234714004e-05, + "loss": 0.555, + "step": 1918, + "task_loss": 0.5022497177124023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5159574747085571, + "epoch": 1.62, + "learning_rate": 4.6543627312858084e-05, + "loss": 0.5716, + "step": 1919, + "task_loss": 0.4647010862827301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.740647554397583, + "epoch": 1.62, + "learning_rate": 4.653893115431578e-05, + "loss": 0.5379, + "step": 1920, + "task_loss": 0.8348552584648132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7009769678115845, + "epoch": 1.62, + "learning_rate": 4.6534234995773456e-05, + "loss": 0.5971, + "step": 1921, + "task_loss": 1.73435640335083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5020410418510437, + "epoch": 1.62, + "learning_rate": 4.652953883723114e-05, + "loss": 0.4729, + "step": 1922, + "task_loss": 0.7718576788902283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36021140217781067, + "epoch": 1.63, + "learning_rate": 4.6524842678688836e-05, + "loss": 0.5819, + "step": 1923, + "task_loss": 0.30075839161872864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3421124815940857, + "epoch": 1.63, + "learning_rate": 4.652014652014652e-05, + "loss": 0.3706, + "step": 1924, + "task_loss": 0.6893089413642883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42204713821411133, + "epoch": 1.63, + "learning_rate": 4.6515450361604215e-05, + "loss": 0.5015, + "step": 1925, + "task_loss": 1.145038366317749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5465960502624512, + "epoch": 1.63, + "learning_rate": 4.6510754203061895e-05, + "loss": 0.6289, + "step": 1926, + "task_loss": 0.8248675465583801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35700979828834534, + "epoch": 1.63, + "learning_rate": 4.650605804451959e-05, + "loss": 0.4823, + "step": 1927, + "task_loss": 0.4545920789241791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4961574673652649, + "epoch": 1.63, + "learning_rate": 4.6501361885977274e-05, + "loss": 0.7004, + "step": 1928, + "task_loss": 1.551714301109314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42916354537010193, + "epoch": 1.63, + "learning_rate": 4.649666572743496e-05, + "loss": 0.6647, + "step": 1929, + "task_loss": 0.3034524619579315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6383439302444458, + "epoch": 1.63, + "learning_rate": 4.649196956889265e-05, + "loss": 0.6976, + "step": 1930, + "task_loss": 0.6374261379241943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5258362293243408, + "epoch": 1.63, + "learning_rate": 4.648727341035033e-05, + "loss": 0.6222, + "step": 1931, + "task_loss": 2.011550188064575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.646300196647644, + "epoch": 1.63, + "learning_rate": 4.6482577251808026e-05, + "loss": 0.6566, + "step": 1932, + "task_loss": 0.49180835485458374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6027108430862427, + "epoch": 1.63, + "learning_rate": 4.647788109326571e-05, + "loss": 0.5654, + "step": 1933, + "task_loss": 0.9724689722061157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4280605912208557, + "epoch": 1.63, + "learning_rate": 4.64731849347234e-05, + "loss": 0.5269, + "step": 1934, + "task_loss": 0.2854591906070709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.448786199092865, + "epoch": 1.64, + "learning_rate": 4.6468488776181085e-05, + "loss": 0.512, + "step": 1935, + "task_loss": 1.0266319513320923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.764808177947998, + "epoch": 1.64, + "learning_rate": 4.646379261763877e-05, + "loss": 0.5364, + "step": 1936, + "task_loss": 0.5424794554710388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5813546180725098, + "epoch": 1.64, + "learning_rate": 4.6459096459096465e-05, + "loss": 0.6073, + "step": 1937, + "task_loss": 0.4363210201263428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34104102849960327, + "epoch": 1.64, + "learning_rate": 4.645440030055415e-05, + "loss": 0.5936, + "step": 1938, + "task_loss": 0.6663821339607239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9777122139930725, + "epoch": 1.64, + "learning_rate": 4.644970414201184e-05, + "loss": 0.6662, + "step": 1939, + "task_loss": 1.4371451139450073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4519270360469818, + "epoch": 1.64, + "learning_rate": 4.6445007983469524e-05, + "loss": 0.4886, + "step": 1940, + "task_loss": 0.6666457653045654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40716052055358887, + "epoch": 1.64, + "learning_rate": 4.644031182492721e-05, + "loss": 0.4942, + "step": 1941, + "task_loss": 0.7209877967834473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.443619966506958, + "epoch": 1.64, + "learning_rate": 4.64356156663849e-05, + "loss": 0.6136, + "step": 1942, + "task_loss": 0.7305147647857666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3493501543998718, + "epoch": 1.64, + "learning_rate": 4.643091950784258e-05, + "loss": 0.5634, + "step": 1943, + "task_loss": 0.884074866771698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5071278810501099, + "epoch": 1.64, + "learning_rate": 4.6426223349300276e-05, + "loss": 0.5124, + "step": 1944, + "task_loss": 0.8684453964233398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.367328941822052, + "epoch": 1.64, + "learning_rate": 4.642152719075796e-05, + "loss": 0.6858, + "step": 1945, + "task_loss": 0.4670770466327667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6910357475280762, + "epoch": 1.64, + "learning_rate": 4.641683103221565e-05, + "loss": 0.6521, + "step": 1946, + "task_loss": 1.6614556312561035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3436511158943176, + "epoch": 1.65, + "learning_rate": 4.6412134873673335e-05, + "loss": 0.5202, + "step": 1947, + "task_loss": 0.6699879169464111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7080458402633667, + "epoch": 1.65, + "learning_rate": 4.640743871513102e-05, + "loss": 0.7517, + "step": 1948, + "task_loss": 1.3655600547790527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7631797194480896, + "epoch": 1.65, + "learning_rate": 4.6402742556588714e-05, + "loss": 0.6153, + "step": 1949, + "task_loss": 1.3300447463989258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36314499378204346, + "epoch": 1.65, + "learning_rate": 4.63980463980464e-05, + "loss": 0.6187, + "step": 1950, + "task_loss": 0.6681150794029236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37765100598335266, + "epoch": 1.65, + "learning_rate": 4.639335023950409e-05, + "loss": 0.5002, + "step": 1951, + "task_loss": 1.1149048805236816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3839151859283447, + "epoch": 1.65, + "learning_rate": 4.6388654080961773e-05, + "loss": 0.5777, + "step": 1952, + "task_loss": 0.4816952347755432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5346459746360779, + "epoch": 1.65, + "learning_rate": 4.6383957922419467e-05, + "loss": 0.5784, + "step": 1953, + "task_loss": 0.37667304277420044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5754196643829346, + "epoch": 1.65, + "learning_rate": 4.637926176387715e-05, + "loss": 0.6845, + "step": 1954, + "task_loss": 0.7497438788414001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46790409088134766, + "epoch": 1.65, + "learning_rate": 4.637456560533484e-05, + "loss": 0.699, + "step": 1955, + "task_loss": 1.074196457862854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4146980345249176, + "epoch": 1.65, + "learning_rate": 4.6369869446792526e-05, + "loss": 0.4743, + "step": 1956, + "task_loss": 0.20992335677146912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7312346696853638, + "epoch": 1.65, + "learning_rate": 4.636517328825021e-05, + "loss": 0.6351, + "step": 1957, + "task_loss": 0.7967596650123596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5554275512695312, + "epoch": 1.65, + "learning_rate": 4.6360477129707905e-05, + "loss": 0.5177, + "step": 1958, + "task_loss": 0.567518949508667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42225921154022217, + "epoch": 1.66, + "learning_rate": 4.635578097116559e-05, + "loss": 0.4947, + "step": 1959, + "task_loss": 0.5452908277511597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7208248376846313, + "epoch": 1.66, + "learning_rate": 4.635108481262328e-05, + "loss": 0.5922, + "step": 1960, + "task_loss": 0.41331160068511963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.352708101272583, + "epoch": 1.66, + "learning_rate": 4.6346388654080964e-05, + "loss": 0.3642, + "step": 1961, + "task_loss": 0.5269802808761597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2675037086009979, + "epoch": 1.66, + "learning_rate": 4.634169249553865e-05, + "loss": 0.4845, + "step": 1962, + "task_loss": 0.12757937610149384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.711147129535675, + "epoch": 1.66, + "learning_rate": 4.6336996336996343e-05, + "loss": 0.4719, + "step": 1963, + "task_loss": 0.1511716991662979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4818832278251648, + "epoch": 1.66, + "learning_rate": 4.633230017845402e-05, + "loss": 0.5088, + "step": 1964, + "task_loss": 0.7447141408920288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46119505167007446, + "epoch": 1.66, + "learning_rate": 4.6327604019911716e-05, + "loss": 0.4088, + "step": 1965, + "task_loss": 0.6016743183135986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49571090936660767, + "epoch": 1.66, + "learning_rate": 4.63229078613694e-05, + "loss": 0.5104, + "step": 1966, + "task_loss": 0.8246663212776184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6447635889053345, + "epoch": 1.66, + "learning_rate": 4.631821170282709e-05, + "loss": 0.5409, + "step": 1967, + "task_loss": 1.0070780515670776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4650660753250122, + "epoch": 1.66, + "learning_rate": 4.6313515544284775e-05, + "loss": 0.4314, + "step": 1968, + "task_loss": 0.7126833200454712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5618286728858948, + "epoch": 1.66, + "learning_rate": 4.630881938574246e-05, + "loss": 0.3939, + "step": 1969, + "task_loss": 0.2841024696826935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5228209495544434, + "epoch": 1.66, + "learning_rate": 4.6304123227200155e-05, + "loss": 0.6358, + "step": 1970, + "task_loss": 0.3963019549846649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.32013314962387085, + "epoch": 1.67, + "learning_rate": 4.629942706865784e-05, + "loss": 0.5701, + "step": 1971, + "task_loss": 1.4695099592208862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48744869232177734, + "epoch": 1.67, + "learning_rate": 4.629473091011553e-05, + "loss": 0.512, + "step": 1972, + "task_loss": 0.9898286461830139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6371152997016907, + "epoch": 1.67, + "learning_rate": 4.6290034751573214e-05, + "loss": 0.5826, + "step": 1973, + "task_loss": 0.8348256349563599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5498768091201782, + "epoch": 1.67, + "learning_rate": 4.62853385930309e-05, + "loss": 0.5286, + "step": 1974, + "task_loss": 0.9081346392631531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8800171613693237, + "epoch": 1.67, + "learning_rate": 4.628064243448859e-05, + "loss": 0.4921, + "step": 1975, + "task_loss": 0.9750813245773315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40541839599609375, + "epoch": 1.67, + "learning_rate": 4.627594627594628e-05, + "loss": 0.5828, + "step": 1976, + "task_loss": 1.143312692642212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43513357639312744, + "epoch": 1.67, + "learning_rate": 4.6271250117403966e-05, + "loss": 0.584, + "step": 1977, + "task_loss": 1.2302149534225464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30312874913215637, + "epoch": 1.67, + "learning_rate": 4.626655395886165e-05, + "loss": 0.5572, + "step": 1978, + "task_loss": 0.867306113243103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.584494948387146, + "epoch": 1.67, + "learning_rate": 4.626185780031934e-05, + "loss": 0.5211, + "step": 1979, + "task_loss": 1.49024498462677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39544039964675903, + "epoch": 1.67, + "learning_rate": 4.625716164177703e-05, + "loss": 0.4817, + "step": 1980, + "task_loss": 0.36360305547714233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.559738039970398, + "epoch": 1.67, + "learning_rate": 4.625246548323471e-05, + "loss": 0.4825, + "step": 1981, + "task_loss": 0.8233970999717712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42535340785980225, + "epoch": 1.67, + "learning_rate": 4.6247769324692404e-05, + "loss": 0.4357, + "step": 1982, + "task_loss": 1.2819576263427734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2912988066673279, + "epoch": 1.68, + "learning_rate": 4.624307316615009e-05, + "loss": 0.5389, + "step": 1983, + "task_loss": 0.53116375207901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6343371272087097, + "epoch": 1.68, + "learning_rate": 4.6238377007607784e-05, + "loss": 0.5888, + "step": 1984, + "task_loss": 0.6254086494445801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5136388540267944, + "epoch": 1.68, + "learning_rate": 4.623368084906547e-05, + "loss": 0.5854, + "step": 1985, + "task_loss": 0.2664932906627655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36403971910476685, + "epoch": 1.68, + "learning_rate": 4.622898469052315e-05, + "loss": 0.5995, + "step": 1986, + "task_loss": 0.702233076095581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6623230576515198, + "epoch": 1.68, + "learning_rate": 4.622428853198084e-05, + "loss": 0.6526, + "step": 1987, + "task_loss": 1.7313801050186157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7548041343688965, + "epoch": 1.68, + "learning_rate": 4.621959237343853e-05, + "loss": 0.647, + "step": 1988, + "task_loss": 1.599843144416809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7858583927154541, + "epoch": 1.68, + "learning_rate": 4.621489621489622e-05, + "loss": 0.5625, + "step": 1989, + "task_loss": 0.6673519611358643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6496902108192444, + "epoch": 1.68, + "learning_rate": 4.62102000563539e-05, + "loss": 0.6132, + "step": 1990, + "task_loss": 0.9289672374725342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24163088202476501, + "epoch": 1.68, + "learning_rate": 4.6205503897811595e-05, + "loss": 0.4131, + "step": 1991, + "task_loss": 0.7417337894439697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39544031023979187, + "epoch": 1.68, + "learning_rate": 4.620080773926928e-05, + "loss": 0.5012, + "step": 1992, + "task_loss": 0.4768454432487488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24969041347503662, + "epoch": 1.68, + "learning_rate": 4.619611158072697e-05, + "loss": 0.4169, + "step": 1993, + "task_loss": 0.6415157914161682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44749224185943604, + "epoch": 1.69, + "learning_rate": 4.6191415422184654e-05, + "loss": 0.5288, + "step": 1994, + "task_loss": 1.1452547311782837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47349682450294495, + "epoch": 1.69, + "learning_rate": 4.618671926364234e-05, + "loss": 0.4517, + "step": 1995, + "task_loss": 0.12785297632217407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5869649648666382, + "epoch": 1.69, + "learning_rate": 4.618202310510003e-05, + "loss": 0.5492, + "step": 1996, + "task_loss": 0.4930233359336853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4729825258255005, + "epoch": 1.69, + "learning_rate": 4.617732694655772e-05, + "loss": 0.6533, + "step": 1997, + "task_loss": 0.737528920173645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.704756498336792, + "epoch": 1.69, + "learning_rate": 4.6172630788015406e-05, + "loss": 0.6278, + "step": 1998, + "task_loss": 1.0836979150772095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39312201738357544, + "epoch": 1.69, + "learning_rate": 4.616793462947309e-05, + "loss": 0.4108, + "step": 1999, + "task_loss": 0.3622550070285797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4303606450557709, + "epoch": 1.69, + "learning_rate": 4.616323847093078e-05, + "loss": 0.5272, + "step": 2000, + "task_loss": 0.18537604808807373 + }, + { + "epoch": 1.69, + "eval_accuracy": 0.904990099009901, + "eval_loss": 0.33097419142723083, + "eval_runtime": 227.7003, + "eval_samples_per_second": 110.891, + "eval_steps_per_second": 0.87, + "step": 2000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6591658592224121, + "epoch": 1.69, + "learning_rate": 4.615854231238847e-05, + "loss": 0.4892, + "step": 2001, + "task_loss": 0.21459172666072845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4866146147251129, + "epoch": 1.69, + "learning_rate": 4.615384615384616e-05, + "loss": 0.5009, + "step": 2002, + "task_loss": 0.181616872549057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47221171855926514, + "epoch": 1.69, + "learning_rate": 4.6149149995303844e-05, + "loss": 0.6112, + "step": 2003, + "task_loss": 0.7320463061332703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5193889141082764, + "epoch": 1.69, + "learning_rate": 4.614445383676153e-05, + "loss": 0.4626, + "step": 2004, + "task_loss": 0.6524899005889893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6947740912437439, + "epoch": 1.69, + "learning_rate": 4.613975767821922e-05, + "loss": 0.5746, + "step": 2005, + "task_loss": 0.37834760546684265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.643571138381958, + "epoch": 1.7, + "learning_rate": 4.613506151967691e-05, + "loss": 0.7315, + "step": 2006, + "task_loss": 2.4295802116394043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.324611634016037, + "epoch": 1.7, + "learning_rate": 4.613036536113459e-05, + "loss": 0.4947, + "step": 2007, + "task_loss": 0.06080076843500137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9815917015075684, + "epoch": 1.7, + "learning_rate": 4.612566920259228e-05, + "loss": 0.6428, + "step": 2008, + "task_loss": 1.59491765499115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4104413390159607, + "epoch": 1.7, + "learning_rate": 4.612097304404997e-05, + "loss": 0.4605, + "step": 2009, + "task_loss": 0.21556295454502106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46001192927360535, + "epoch": 1.7, + "learning_rate": 4.6116276885507656e-05, + "loss": 0.6466, + "step": 2010, + "task_loss": 1.0571357011795044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.462877482175827, + "epoch": 1.7, + "learning_rate": 4.611158072696534e-05, + "loss": 0.6265, + "step": 2011, + "task_loss": 0.43202754855155945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.327553391456604, + "epoch": 1.7, + "learning_rate": 4.610688456842303e-05, + "loss": 0.366, + "step": 2012, + "task_loss": 0.4694058895111084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5055482387542725, + "epoch": 1.7, + "learning_rate": 4.610218840988072e-05, + "loss": 0.4806, + "step": 2013, + "task_loss": 1.1611257791519165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45444104075431824, + "epoch": 1.7, + "learning_rate": 4.609749225133841e-05, + "loss": 0.4351, + "step": 2014, + "task_loss": 0.48207858204841614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5387264490127563, + "epoch": 1.7, + "learning_rate": 4.6092796092796094e-05, + "loss": 0.5912, + "step": 2015, + "task_loss": 0.5317508578300476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5355345010757446, + "epoch": 1.7, + "learning_rate": 4.608809993425378e-05, + "loss": 0.554, + "step": 2016, + "task_loss": 1.2317243814468384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5639587044715881, + "epoch": 1.7, + "learning_rate": 4.608340377571147e-05, + "loss": 0.6746, + "step": 2017, + "task_loss": 2.0961287021636963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4055578410625458, + "epoch": 1.71, + "learning_rate": 4.607870761716916e-05, + "loss": 0.5017, + "step": 2018, + "task_loss": 0.9728465676307678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46916595101356506, + "epoch": 1.71, + "learning_rate": 4.6074011458626846e-05, + "loss": 0.5937, + "step": 2019, + "task_loss": 0.7556138038635254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5292918086051941, + "epoch": 1.71, + "learning_rate": 4.606931530008453e-05, + "loss": 0.5846, + "step": 2020, + "task_loss": 0.9846111536026001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4615810215473175, + "epoch": 1.71, + "learning_rate": 4.606461914154222e-05, + "loss": 0.5004, + "step": 2021, + "task_loss": 0.62034010887146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.62611985206604, + "epoch": 1.71, + "learning_rate": 4.605992298299991e-05, + "loss": 0.4975, + "step": 2022, + "task_loss": 0.2502720355987549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.583138108253479, + "epoch": 1.71, + "learning_rate": 4.60552268244576e-05, + "loss": 0.4532, + "step": 2023, + "task_loss": 2.597763776779175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6877407431602478, + "epoch": 1.71, + "learning_rate": 4.605053066591528e-05, + "loss": 0.6673, + "step": 2024, + "task_loss": 0.628801167011261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7325685620307922, + "epoch": 1.71, + "learning_rate": 4.604583450737297e-05, + "loss": 0.7999, + "step": 2025, + "task_loss": 1.5668489933013916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3492036759853363, + "epoch": 1.71, + "learning_rate": 4.604113834883066e-05, + "loss": 0.4446, + "step": 2026, + "task_loss": 0.6551200747489929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6214760541915894, + "epoch": 1.71, + "learning_rate": 4.603644219028835e-05, + "loss": 0.5697, + "step": 2027, + "task_loss": 1.3091659545898438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45524364709854126, + "epoch": 1.71, + "learning_rate": 4.603174603174603e-05, + "loss": 0.5027, + "step": 2028, + "task_loss": 0.3164771795272827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5005728006362915, + "epoch": 1.71, + "learning_rate": 4.602704987320372e-05, + "loss": 0.5163, + "step": 2029, + "task_loss": 0.3878321349620819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41635948419570923, + "epoch": 1.72, + "learning_rate": 4.602235371466141e-05, + "loss": 0.6506, + "step": 2030, + "task_loss": 0.7554118037223816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5966565608978271, + "epoch": 1.72, + "learning_rate": 4.6017657556119096e-05, + "loss": 0.6486, + "step": 2031, + "task_loss": 0.6373807191848755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5056440830230713, + "epoch": 1.72, + "learning_rate": 4.601296139757679e-05, + "loss": 0.4492, + "step": 2032, + "task_loss": 0.23589545488357544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5700172185897827, + "epoch": 1.72, + "learning_rate": 4.600826523903447e-05, + "loss": 0.5041, + "step": 2033, + "task_loss": 1.4903006553649902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41189807653427124, + "epoch": 1.72, + "learning_rate": 4.600356908049216e-05, + "loss": 0.431, + "step": 2034, + "task_loss": 0.8087654709815979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4895195960998535, + "epoch": 1.72, + "learning_rate": 4.599887292194985e-05, + "loss": 0.5135, + "step": 2035, + "task_loss": 0.581380307674408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5028020739555359, + "epoch": 1.72, + "learning_rate": 4.5994176763407534e-05, + "loss": 0.5303, + "step": 2036, + "task_loss": 0.8911030292510986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.494489848613739, + "epoch": 1.72, + "learning_rate": 4.598948060486522e-05, + "loss": 0.5289, + "step": 2037, + "task_loss": 1.2926629781723022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8407003879547119, + "epoch": 1.72, + "learning_rate": 4.598478444632291e-05, + "loss": 0.6085, + "step": 2038, + "task_loss": 1.1930837631225586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4640425145626068, + "epoch": 1.72, + "learning_rate": 4.59800882877806e-05, + "loss": 0.6198, + "step": 2039, + "task_loss": 0.4546608328819275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.511309802532196, + "epoch": 1.72, + "learning_rate": 4.5975392129238286e-05, + "loss": 0.5555, + "step": 2040, + "task_loss": 0.8773356676101685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4771813154220581, + "epoch": 1.72, + "learning_rate": 4.597069597069597e-05, + "loss": 0.512, + "step": 2041, + "task_loss": 0.5802324414253235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5402620434761047, + "epoch": 1.73, + "learning_rate": 4.596599981215366e-05, + "loss": 0.6719, + "step": 2042, + "task_loss": 0.3395357131958008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3940516710281372, + "epoch": 1.73, + "learning_rate": 4.5961303653611346e-05, + "loss": 0.3756, + "step": 2043, + "task_loss": 0.8970284461975098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.423417866230011, + "epoch": 1.73, + "learning_rate": 4.595660749506904e-05, + "loss": 0.7119, + "step": 2044, + "task_loss": 0.757098913192749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35032856464385986, + "epoch": 1.73, + "learning_rate": 4.595191133652672e-05, + "loss": 0.4584, + "step": 2045, + "task_loss": 1.466117024421692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25562119483947754, + "epoch": 1.73, + "learning_rate": 4.594721517798441e-05, + "loss": 0.5246, + "step": 2046, + "task_loss": 0.350289523601532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3606404662132263, + "epoch": 1.73, + "learning_rate": 4.59425190194421e-05, + "loss": 0.4952, + "step": 2047, + "task_loss": 1.1569145917892456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3920883536338806, + "epoch": 1.73, + "learning_rate": 4.593782286089979e-05, + "loss": 0.4757, + "step": 2048, + "task_loss": 0.47120994329452515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.52159184217453, + "epoch": 1.73, + "learning_rate": 4.593312670235748e-05, + "loss": 0.4805, + "step": 2049, + "task_loss": 1.0600637197494507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6532259583473206, + "epoch": 1.73, + "learning_rate": 4.592843054381516e-05, + "loss": 0.7079, + "step": 2050, + "task_loss": 0.5629146099090576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5053825974464417, + "epoch": 1.73, + "learning_rate": 4.592373438527285e-05, + "loss": 0.5325, + "step": 2051, + "task_loss": 0.24431300163269043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3835293650627136, + "epoch": 1.73, + "learning_rate": 4.5919038226730536e-05, + "loss": 0.605, + "step": 2052, + "task_loss": 0.8664827346801758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4695228338241577, + "epoch": 1.73, + "learning_rate": 4.591434206818823e-05, + "loss": 0.4481, + "step": 2053, + "task_loss": 0.5434091687202454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3437801003456116, + "epoch": 1.74, + "learning_rate": 4.590964590964591e-05, + "loss": 0.5702, + "step": 2054, + "task_loss": 0.17606034874916077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.691542387008667, + "epoch": 1.74, + "learning_rate": 4.59049497511036e-05, + "loss": 0.5252, + "step": 2055, + "task_loss": 1.209094524383545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5686816573143005, + "epoch": 1.74, + "learning_rate": 4.590025359256129e-05, + "loss": 0.5176, + "step": 2056, + "task_loss": 0.7480659484863281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4564830958843231, + "epoch": 1.74, + "learning_rate": 4.5895557434018975e-05, + "loss": 0.5654, + "step": 2057, + "task_loss": 1.0726795196533203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4620174765586853, + "epoch": 1.74, + "learning_rate": 4.589086127547666e-05, + "loss": 0.4045, + "step": 2058, + "task_loss": 0.7516773343086243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5002691149711609, + "epoch": 1.74, + "learning_rate": 4.588616511693435e-05, + "loss": 0.4714, + "step": 2059, + "task_loss": 1.3765820264816284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3602658808231354, + "epoch": 1.74, + "learning_rate": 4.588146895839204e-05, + "loss": 0.4749, + "step": 2060, + "task_loss": 0.10768648236989975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27802640199661255, + "epoch": 1.74, + "learning_rate": 4.587677279984973e-05, + "loss": 0.5232, + "step": 2061, + "task_loss": 0.1309250295162201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6688908934593201, + "epoch": 1.74, + "learning_rate": 4.587207664130741e-05, + "loss": 0.5011, + "step": 2062, + "task_loss": 0.808330774307251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4703649878501892, + "epoch": 1.74, + "learning_rate": 4.58673804827651e-05, + "loss": 0.3553, + "step": 2063, + "task_loss": 0.040938060730695724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4743626117706299, + "epoch": 1.74, + "learning_rate": 4.5862684324222786e-05, + "loss": 0.4116, + "step": 2064, + "task_loss": 0.8179786205291748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.675616979598999, + "epoch": 1.75, + "learning_rate": 4.585798816568048e-05, + "loss": 0.4769, + "step": 2065, + "task_loss": 0.4578036069869995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8712598085403442, + "epoch": 1.75, + "learning_rate": 4.5853292007138165e-05, + "loss": 0.7874, + "step": 2066, + "task_loss": 0.7406539916992188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6521067023277283, + "epoch": 1.75, + "learning_rate": 4.584859584859585e-05, + "loss": 0.6392, + "step": 2067, + "task_loss": 1.0126603841781616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 1.2566328048706055, + "epoch": 1.75, + "learning_rate": 4.584389969005354e-05, + "loss": 0.7131, + "step": 2068, + "task_loss": 1.0378272533416748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36832600831985474, + "epoch": 1.75, + "learning_rate": 4.5839203531511224e-05, + "loss": 0.4704, + "step": 2069, + "task_loss": 0.841235876083374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5462263822555542, + "epoch": 1.75, + "learning_rate": 4.583450737296892e-05, + "loss": 0.6086, + "step": 2070, + "task_loss": 0.0827615037560463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22516904771327972, + "epoch": 1.75, + "learning_rate": 4.58298112144266e-05, + "loss": 0.433, + "step": 2071, + "task_loss": 0.16937336325645447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6932924389839172, + "epoch": 1.75, + "learning_rate": 4.582511505588429e-05, + "loss": 0.6002, + "step": 2072, + "task_loss": 0.0861826241016388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3899523615837097, + "epoch": 1.75, + "learning_rate": 4.5820418897341976e-05, + "loss": 0.6182, + "step": 2073, + "task_loss": 0.298385888338089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36478549242019653, + "epoch": 1.75, + "learning_rate": 4.581572273879966e-05, + "loss": 0.4347, + "step": 2074, + "task_loss": 0.39235949516296387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.617392897605896, + "epoch": 1.75, + "learning_rate": 4.581102658025735e-05, + "loss": 0.4981, + "step": 2075, + "task_loss": 1.016727328300476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4272981286048889, + "epoch": 1.75, + "learning_rate": 4.5806330421715035e-05, + "loss": 0.3981, + "step": 2076, + "task_loss": 0.13504400849342346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2955678105354309, + "epoch": 1.76, + "learning_rate": 4.580163426317273e-05, + "loss": 0.3571, + "step": 2077, + "task_loss": 0.12647226452827454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4585294723510742, + "epoch": 1.76, + "learning_rate": 4.5796938104630415e-05, + "loss": 0.4787, + "step": 2078, + "task_loss": 0.5914861559867859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3602704107761383, + "epoch": 1.76, + "learning_rate": 4.579224194608811e-05, + "loss": 0.5521, + "step": 2079, + "task_loss": 0.1589622050523758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35574787855148315, + "epoch": 1.76, + "learning_rate": 4.578754578754579e-05, + "loss": 0.4896, + "step": 2080, + "task_loss": 0.8386611342430115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5386511087417603, + "epoch": 1.76, + "learning_rate": 4.5782849629003474e-05, + "loss": 0.6548, + "step": 2081, + "task_loss": 0.7850254774093628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7446581125259399, + "epoch": 1.76, + "learning_rate": 4.577815347046117e-05, + "loss": 0.6338, + "step": 2082, + "task_loss": 0.7763950824737549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5751662850379944, + "epoch": 1.76, + "learning_rate": 4.577345731191885e-05, + "loss": 0.5322, + "step": 2083, + "task_loss": 2.5131635665893555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34754621982574463, + "epoch": 1.76, + "learning_rate": 4.576876115337654e-05, + "loss": 0.4211, + "step": 2084, + "task_loss": 0.1351654976606369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42446279525756836, + "epoch": 1.76, + "learning_rate": 4.5764064994834226e-05, + "loss": 0.5849, + "step": 2085, + "task_loss": 2.179248809814453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5092846155166626, + "epoch": 1.76, + "learning_rate": 4.575936883629192e-05, + "loss": 0.5062, + "step": 2086, + "task_loss": 0.4836285710334778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2724847197532654, + "epoch": 1.76, + "learning_rate": 4.5754672677749605e-05, + "loss": 0.3913, + "step": 2087, + "task_loss": 0.23720763623714447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6237320899963379, + "epoch": 1.76, + "learning_rate": 4.5749976519207285e-05, + "loss": 0.5447, + "step": 2088, + "task_loss": 0.35276830196380615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5783926844596863, + "epoch": 1.77, + "learning_rate": 4.574528036066498e-05, + "loss": 0.6048, + "step": 2089, + "task_loss": 1.3933990001678467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4360746145248413, + "epoch": 1.77, + "learning_rate": 4.5740584202122664e-05, + "loss": 0.598, + "step": 2090, + "task_loss": 0.9829727411270142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3990347683429718, + "epoch": 1.77, + "learning_rate": 4.573588804358036e-05, + "loss": 0.6707, + "step": 2091, + "task_loss": 0.6798000335693359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2960702180862427, + "epoch": 1.77, + "learning_rate": 4.573119188503804e-05, + "loss": 0.5829, + "step": 2092, + "task_loss": 0.5920838713645935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6754752993583679, + "epoch": 1.77, + "learning_rate": 4.572649572649573e-05, + "loss": 0.5883, + "step": 2093, + "task_loss": 0.9752802848815918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5618507266044617, + "epoch": 1.77, + "learning_rate": 4.5721799567953417e-05, + "loss": 0.5188, + "step": 2094, + "task_loss": 0.8732621669769287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5569792985916138, + "epoch": 1.77, + "learning_rate": 4.57171034094111e-05, + "loss": 0.5445, + "step": 2095, + "task_loss": 0.9643384218215942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31552353501319885, + "epoch": 1.77, + "learning_rate": 4.5712407250868796e-05, + "loss": 0.3805, + "step": 2096, + "task_loss": 1.038280725479126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.583950400352478, + "epoch": 1.77, + "learning_rate": 4.5707711092326476e-05, + "loss": 0.5644, + "step": 2097, + "task_loss": 1.3596307039260864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23062486946582794, + "epoch": 1.77, + "learning_rate": 4.570301493378417e-05, + "loss": 0.4612, + "step": 2098, + "task_loss": 0.7762107849121094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5040269494056702, + "epoch": 1.77, + "learning_rate": 4.5698318775241855e-05, + "loss": 0.615, + "step": 2099, + "task_loss": 0.8172342777252197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.21802948415279388, + "epoch": 1.77, + "learning_rate": 4.569362261669954e-05, + "loss": 0.3645, + "step": 2100, + "task_loss": 0.4495164155960083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4093010425567627, + "epoch": 1.78, + "learning_rate": 4.568892645815723e-05, + "loss": 0.4773, + "step": 2101, + "task_loss": 0.6140013337135315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7371014356613159, + "epoch": 1.78, + "learning_rate": 4.5684230299614914e-05, + "loss": 0.5325, + "step": 2102, + "task_loss": 0.8337481021881104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38302743434906006, + "epoch": 1.78, + "learning_rate": 4.567953414107261e-05, + "loss": 0.4941, + "step": 2103, + "task_loss": 0.5175051689147949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36614078283309937, + "epoch": 1.78, + "learning_rate": 4.5674837982530294e-05, + "loss": 0.4696, + "step": 2104, + "task_loss": 0.8599974513053894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7975735068321228, + "epoch": 1.78, + "learning_rate": 4.567014182398798e-05, + "loss": 0.6115, + "step": 2105, + "task_loss": 0.7631282806396484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39564090967178345, + "epoch": 1.78, + "learning_rate": 4.5665445665445666e-05, + "loss": 0.4693, + "step": 2106, + "task_loss": 1.1201039552688599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6424553394317627, + "epoch": 1.78, + "learning_rate": 4.566074950690335e-05, + "loss": 0.7802, + "step": 2107, + "task_loss": 0.7485374212265015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7212977409362793, + "epoch": 1.78, + "learning_rate": 4.5656053348361046e-05, + "loss": 0.548, + "step": 2108, + "task_loss": 0.9998968839645386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6507986783981323, + "epoch": 1.78, + "learning_rate": 4.565135718981873e-05, + "loss": 0.5358, + "step": 2109, + "task_loss": 1.282192587852478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4429655373096466, + "epoch": 1.78, + "learning_rate": 4.564666103127642e-05, + "loss": 0.5711, + "step": 2110, + "task_loss": 0.7206053137779236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39231905341148376, + "epoch": 1.78, + "learning_rate": 4.5641964872734105e-05, + "loss": 0.5264, + "step": 2111, + "task_loss": 0.39057064056396484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4905829429626465, + "epoch": 1.78, + "learning_rate": 4.563726871419179e-05, + "loss": 0.6899, + "step": 2112, + "task_loss": 0.13349761068820953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49836465716362, + "epoch": 1.79, + "learning_rate": 4.5632572555649484e-05, + "loss": 0.5453, + "step": 2113, + "task_loss": 0.1886162906885147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5580978989601135, + "epoch": 1.79, + "learning_rate": 4.5627876397107164e-05, + "loss": 0.5938, + "step": 2114, + "task_loss": 0.868122398853302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6009644269943237, + "epoch": 1.79, + "learning_rate": 4.562318023856486e-05, + "loss": 0.5606, + "step": 2115, + "task_loss": 1.0359578132629395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4018334150314331, + "epoch": 1.79, + "learning_rate": 4.561848408002254e-05, + "loss": 0.5318, + "step": 2116, + "task_loss": 0.8555917143821716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40596720576286316, + "epoch": 1.79, + "learning_rate": 4.5613787921480236e-05, + "loss": 0.571, + "step": 2117, + "task_loss": 0.30545079708099365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6815071702003479, + "epoch": 1.79, + "learning_rate": 4.5609091762937916e-05, + "loss": 0.4764, + "step": 2118, + "task_loss": 0.6880651116371155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7121834754943848, + "epoch": 1.79, + "learning_rate": 4.56043956043956e-05, + "loss": 0.5259, + "step": 2119, + "task_loss": 1.4275411367416382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33292773365974426, + "epoch": 1.79, + "learning_rate": 4.5599699445853295e-05, + "loss": 0.7284, + "step": 2120, + "task_loss": 0.7352378964424133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4035104513168335, + "epoch": 1.79, + "learning_rate": 4.559500328731098e-05, + "loss": 0.5288, + "step": 2121, + "task_loss": 1.265568494796753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47141703963279724, + "epoch": 1.79, + "learning_rate": 4.559030712876867e-05, + "loss": 0.5049, + "step": 2122, + "task_loss": 0.6432398557662964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7463590502738953, + "epoch": 1.79, + "learning_rate": 4.5585610970226354e-05, + "loss": 0.7512, + "step": 2123, + "task_loss": 1.4116387367248535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.571821928024292, + "epoch": 1.79, + "learning_rate": 4.558091481168405e-05, + "loss": 0.5925, + "step": 2124, + "task_loss": 1.356939435005188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6260712146759033, + "epoch": 1.8, + "learning_rate": 4.5576218653141734e-05, + "loss": 0.6078, + "step": 2125, + "task_loss": 0.6571511030197144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6212284564971924, + "epoch": 1.8, + "learning_rate": 4.557152249459942e-05, + "loss": 0.5109, + "step": 2126, + "task_loss": 0.9573767185211182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5255595445632935, + "epoch": 1.8, + "learning_rate": 4.5566826336057106e-05, + "loss": 0.59, + "step": 2127, + "task_loss": 0.652902364730835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4078429937362671, + "epoch": 1.8, + "learning_rate": 4.556213017751479e-05, + "loss": 0.5608, + "step": 2128, + "task_loss": 0.7663223743438721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6555610299110413, + "epoch": 1.8, + "learning_rate": 4.5557434018972486e-05, + "loss": 0.5736, + "step": 2129, + "task_loss": 0.16959886252880096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25424739718437195, + "epoch": 1.8, + "learning_rate": 4.555273786043017e-05, + "loss": 0.533, + "step": 2130, + "task_loss": 0.2955516576766968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6188502311706543, + "epoch": 1.8, + "learning_rate": 4.554804170188786e-05, + "loss": 0.499, + "step": 2131, + "task_loss": 0.5569431185722351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5260052680969238, + "epoch": 1.8, + "learning_rate": 4.5543345543345545e-05, + "loss": 0.4241, + "step": 2132, + "task_loss": 1.1661746501922607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3231273591518402, + "epoch": 1.8, + "learning_rate": 4.553864938480323e-05, + "loss": 0.3994, + "step": 2133, + "task_loss": 0.8574492931365967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8729653358459473, + "epoch": 1.8, + "learning_rate": 4.5533953226260924e-05, + "loss": 0.629, + "step": 2134, + "task_loss": 0.5370767116546631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5918306112289429, + "epoch": 1.8, + "learning_rate": 4.5529257067718604e-05, + "loss": 0.5093, + "step": 2135, + "task_loss": 1.4300636053085327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3197835385799408, + "epoch": 1.81, + "learning_rate": 4.55245609091763e-05, + "loss": 0.4224, + "step": 2136, + "task_loss": 0.42014288902282715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5788506269454956, + "epoch": 1.81, + "learning_rate": 4.5519864750633983e-05, + "loss": 0.5234, + "step": 2137, + "task_loss": 1.5012747049331665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5487416982650757, + "epoch": 1.81, + "learning_rate": 4.551516859209167e-05, + "loss": 0.5452, + "step": 2138, + "task_loss": 1.2507144212722778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2663614749908447, + "epoch": 1.81, + "learning_rate": 4.551047243354936e-05, + "loss": 0.3141, + "step": 2139, + "task_loss": 0.38411852717399597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6469610333442688, + "epoch": 1.81, + "learning_rate": 4.550577627500704e-05, + "loss": 0.5484, + "step": 2140, + "task_loss": 0.7316147685050964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33591869473457336, + "epoch": 1.81, + "learning_rate": 4.5501080116464736e-05, + "loss": 0.4701, + "step": 2141, + "task_loss": 0.15448403358459473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.845852255821228, + "epoch": 1.81, + "learning_rate": 4.549638395792242e-05, + "loss": 0.774, + "step": 2142, + "task_loss": 0.9374898076057434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5409667491912842, + "epoch": 1.81, + "learning_rate": 4.549168779938011e-05, + "loss": 0.6409, + "step": 2143, + "task_loss": 1.0946840047836304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7807337045669556, + "epoch": 1.81, + "learning_rate": 4.5486991640837795e-05, + "loss": 0.5916, + "step": 2144, + "task_loss": 1.2808492183685303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3663102686405182, + "epoch": 1.81, + "learning_rate": 4.548229548229548e-05, + "loss": 0.507, + "step": 2145, + "task_loss": 0.5666319727897644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6047992706298828, + "epoch": 1.81, + "learning_rate": 4.5477599323753174e-05, + "loss": 0.4602, + "step": 2146, + "task_loss": 1.170518159866333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4351532459259033, + "epoch": 1.81, + "learning_rate": 4.547290316521086e-05, + "loss": 0.5223, + "step": 2147, + "task_loss": 0.5255849361419678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7757853865623474, + "epoch": 1.82, + "learning_rate": 4.546820700666855e-05, + "loss": 0.5872, + "step": 2148, + "task_loss": 0.361608624458313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6012972593307495, + "epoch": 1.82, + "learning_rate": 4.546351084812623e-05, + "loss": 0.5811, + "step": 2149, + "task_loss": 0.5504059791564941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44565412402153015, + "epoch": 1.82, + "learning_rate": 4.5458814689583926e-05, + "loss": 0.5359, + "step": 2150, + "task_loss": 0.2065805345773697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.589625358581543, + "epoch": 1.82, + "learning_rate": 4.545411853104161e-05, + "loss": 0.5279, + "step": 2151, + "task_loss": 0.6667364835739136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4272102415561676, + "epoch": 1.82, + "learning_rate": 4.544942237249929e-05, + "loss": 0.4566, + "step": 2152, + "task_loss": 0.796841025352478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5954587459564209, + "epoch": 1.82, + "learning_rate": 4.5444726213956985e-05, + "loss": 0.5288, + "step": 2153, + "task_loss": 0.7692040801048279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7058021426200867, + "epoch": 1.82, + "learning_rate": 4.544003005541467e-05, + "loss": 0.625, + "step": 2154, + "task_loss": 1.066992163658142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42417773604393005, + "epoch": 1.82, + "learning_rate": 4.5435333896872365e-05, + "loss": 0.4271, + "step": 2155, + "task_loss": 0.7044788002967834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8196185827255249, + "epoch": 1.82, + "learning_rate": 4.543063773833005e-05, + "loss": 0.5676, + "step": 2156, + "task_loss": 1.4717459678649902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41547542810440063, + "epoch": 1.82, + "learning_rate": 4.542594157978774e-05, + "loss": 0.5298, + "step": 2157, + "task_loss": 0.7178290486335754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49613094329833984, + "epoch": 1.82, + "learning_rate": 4.5421245421245424e-05, + "loss": 0.4604, + "step": 2158, + "task_loss": 0.6860933303833008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6807384490966797, + "epoch": 1.82, + "learning_rate": 4.541654926270311e-05, + "loss": 0.6239, + "step": 2159, + "task_loss": 1.0852456092834473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.582312822341919, + "epoch": 1.83, + "learning_rate": 4.54118531041608e-05, + "loss": 0.6066, + "step": 2160, + "task_loss": 0.9413288831710815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3024747967720032, + "epoch": 1.83, + "learning_rate": 4.540715694561848e-05, + "loss": 0.5687, + "step": 2161, + "task_loss": 0.24552661180496216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7132899761199951, + "epoch": 1.83, + "learning_rate": 4.5402460787076176e-05, + "loss": 0.6837, + "step": 2162, + "task_loss": 0.6264849305152893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.27583473920822144, + "epoch": 1.83, + "learning_rate": 4.539776462853386e-05, + "loss": 0.608, + "step": 2163, + "task_loss": 0.10471760481595993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4249713718891144, + "epoch": 1.83, + "learning_rate": 4.539306846999155e-05, + "loss": 0.4802, + "step": 2164, + "task_loss": 0.48982134461402893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5520041584968567, + "epoch": 1.83, + "learning_rate": 4.5388372311449235e-05, + "loss": 0.4561, + "step": 2165, + "task_loss": 0.7499051690101624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44273197650909424, + "epoch": 1.83, + "learning_rate": 4.538367615290692e-05, + "loss": 0.3812, + "step": 2166, + "task_loss": 0.7038940191268921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41414177417755127, + "epoch": 1.83, + "learning_rate": 4.5378979994364614e-05, + "loss": 0.5456, + "step": 2167, + "task_loss": 0.28725168108940125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7032431960105896, + "epoch": 1.83, + "learning_rate": 4.53742838358223e-05, + "loss": 0.5515, + "step": 2168, + "task_loss": 0.826956570148468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4881632328033447, + "epoch": 1.83, + "learning_rate": 4.536958767727999e-05, + "loss": 0.6441, + "step": 2169, + "task_loss": 0.23439858853816986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5184561014175415, + "epoch": 1.83, + "learning_rate": 4.536489151873767e-05, + "loss": 0.4243, + "step": 2170, + "task_loss": 0.6797229051589966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7807387113571167, + "epoch": 1.83, + "learning_rate": 4.536019536019536e-05, + "loss": 0.6407, + "step": 2171, + "task_loss": 1.397269368171692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.620652973651886, + "epoch": 1.84, + "learning_rate": 4.535549920165305e-05, + "loss": 0.5552, + "step": 2172, + "task_loss": 1.65139639377594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.829352617263794, + "epoch": 1.84, + "learning_rate": 4.535080304311074e-05, + "loss": 0.5462, + "step": 2173, + "task_loss": 0.4907941520214081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.637789249420166, + "epoch": 1.84, + "learning_rate": 4.5346106884568425e-05, + "loss": 0.572, + "step": 2174, + "task_loss": 0.952836275100708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25070053339004517, + "epoch": 1.84, + "learning_rate": 4.534141072602611e-05, + "loss": 0.4687, + "step": 2175, + "task_loss": 0.17430594563484192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41071581840515137, + "epoch": 1.84, + "learning_rate": 4.53367145674838e-05, + "loss": 0.5044, + "step": 2176, + "task_loss": 1.1735378503799438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3028821349143982, + "epoch": 1.84, + "learning_rate": 4.533201840894149e-05, + "loss": 0.4292, + "step": 2177, + "task_loss": 0.20326544344425201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4037761092185974, + "epoch": 1.84, + "learning_rate": 4.532732225039917e-05, + "loss": 0.4835, + "step": 2178, + "task_loss": 0.7271296977996826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47745582461357117, + "epoch": 1.84, + "learning_rate": 4.5322626091856864e-05, + "loss": 0.497, + "step": 2179, + "task_loss": 0.9477565288543701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3540424704551697, + "epoch": 1.84, + "learning_rate": 4.531792993331455e-05, + "loss": 0.4406, + "step": 2180, + "task_loss": 0.9853403568267822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6484129428863525, + "epoch": 1.84, + "learning_rate": 4.531323377477224e-05, + "loss": 0.568, + "step": 2181, + "task_loss": 1.2559936046600342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5226573944091797, + "epoch": 1.84, + "learning_rate": 4.530853761622992e-05, + "loss": 0.4311, + "step": 2182, + "task_loss": 0.2541341781616211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3125724792480469, + "epoch": 1.84, + "learning_rate": 4.530384145768761e-05, + "loss": 0.474, + "step": 2183, + "task_loss": 1.44239342212677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.578337550163269, + "epoch": 1.85, + "learning_rate": 4.52991452991453e-05, + "loss": 0.7536, + "step": 2184, + "task_loss": 1.3004363775253296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6428928375244141, + "epoch": 1.85, + "learning_rate": 4.529444914060299e-05, + "loss": 0.5545, + "step": 2185, + "task_loss": 0.6410274505615234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.655086100101471, + "epoch": 1.85, + "learning_rate": 4.528975298206068e-05, + "loss": 0.668, + "step": 2186, + "task_loss": 1.9022458791732788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7058622241020203, + "epoch": 1.85, + "learning_rate": 4.528505682351836e-05, + "loss": 0.3764, + "step": 2187, + "task_loss": 0.5981324315071106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5062519311904907, + "epoch": 1.85, + "learning_rate": 4.5280360664976054e-05, + "loss": 0.5433, + "step": 2188, + "task_loss": 0.43108922243118286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2743852138519287, + "epoch": 1.85, + "learning_rate": 4.527566450643374e-05, + "loss": 0.2869, + "step": 2189, + "task_loss": 0.3801107406616211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3619213104248047, + "epoch": 1.85, + "learning_rate": 4.527096834789143e-05, + "loss": 0.4097, + "step": 2190, + "task_loss": 0.5799493789672852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5339503288269043, + "epoch": 1.85, + "learning_rate": 4.5266272189349114e-05, + "loss": 0.5173, + "step": 2191, + "task_loss": 0.615601122379303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35835206508636475, + "epoch": 1.85, + "learning_rate": 4.52615760308068e-05, + "loss": 0.4037, + "step": 2192, + "task_loss": 0.32787516713142395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41713377833366394, + "epoch": 1.85, + "learning_rate": 4.525687987226449e-05, + "loss": 0.5448, + "step": 2193, + "task_loss": 0.554107129573822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5735433101654053, + "epoch": 1.85, + "learning_rate": 4.525218371372218e-05, + "loss": 0.4704, + "step": 2194, + "task_loss": 0.6607270240783691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6074005365371704, + "epoch": 1.85, + "learning_rate": 4.5247487555179866e-05, + "loss": 0.4724, + "step": 2195, + "task_loss": 0.5390154719352722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.30267736315727234, + "epoch": 1.86, + "learning_rate": 4.524279139663755e-05, + "loss": 0.5718, + "step": 2196, + "task_loss": 0.3005446195602417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3553377687931061, + "epoch": 1.86, + "learning_rate": 4.523809523809524e-05, + "loss": 0.5129, + "step": 2197, + "task_loss": 0.5187085270881653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.294472336769104, + "epoch": 1.86, + "learning_rate": 4.523339907955293e-05, + "loss": 0.5451, + "step": 2198, + "task_loss": 0.4742613136768341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.563508927822113, + "epoch": 1.86, + "learning_rate": 4.522870292101061e-05, + "loss": 0.5543, + "step": 2199, + "task_loss": 0.7037122845649719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33230429887771606, + "epoch": 1.86, + "learning_rate": 4.5224006762468304e-05, + "loss": 0.6198, + "step": 2200, + "task_loss": 0.6052303314208984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5442047119140625, + "epoch": 1.86, + "learning_rate": 4.521931060392599e-05, + "loss": 0.4875, + "step": 2201, + "task_loss": 0.5520533919334412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.43952640891075134, + "epoch": 1.86, + "learning_rate": 4.521461444538368e-05, + "loss": 0.4513, + "step": 2202, + "task_loss": 0.3257262706756592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3801150918006897, + "epoch": 1.86, + "learning_rate": 4.520991828684137e-05, + "loss": 0.4079, + "step": 2203, + "task_loss": 0.6574422121047974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42563432455062866, + "epoch": 1.86, + "learning_rate": 4.520522212829905e-05, + "loss": 0.4891, + "step": 2204, + "task_loss": 0.7676490545272827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49109360575675964, + "epoch": 1.86, + "learning_rate": 4.520052596975674e-05, + "loss": 0.428, + "step": 2205, + "task_loss": 0.5695843696594238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3752708435058594, + "epoch": 1.86, + "learning_rate": 4.519582981121443e-05, + "loss": 0.5115, + "step": 2206, + "task_loss": 0.4675496518611908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48420971632003784, + "epoch": 1.87, + "learning_rate": 4.5191133652672115e-05, + "loss": 0.5828, + "step": 2207, + "task_loss": 0.9840088486671448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4950655698776245, + "epoch": 1.87, + "learning_rate": 4.51864374941298e-05, + "loss": 0.5466, + "step": 2208, + "task_loss": 0.5158939957618713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6614522933959961, + "epoch": 1.87, + "learning_rate": 4.518174133558749e-05, + "loss": 0.476, + "step": 2209, + "task_loss": 0.9019045233726501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5384057760238647, + "epoch": 1.87, + "learning_rate": 4.517704517704518e-05, + "loss": 0.4236, + "step": 2210, + "task_loss": 0.8246455788612366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39109566807746887, + "epoch": 1.87, + "learning_rate": 4.517234901850287e-05, + "loss": 0.3618, + "step": 2211, + "task_loss": 0.6517609357833862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.47159406542778015, + "epoch": 1.87, + "learning_rate": 4.5167652859960554e-05, + "loss": 0.4765, + "step": 2212, + "task_loss": 1.1871362924575806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4874321222305298, + "epoch": 1.87, + "learning_rate": 4.516295670141824e-05, + "loss": 0.5, + "step": 2213, + "task_loss": 0.7124010920524597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5585620403289795, + "epoch": 1.87, + "learning_rate": 4.5158260542875926e-05, + "loss": 0.5257, + "step": 2214, + "task_loss": 0.7095432877540588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.9153774976730347, + "epoch": 1.87, + "learning_rate": 4.515356438433362e-05, + "loss": 0.6041, + "step": 2215, + "task_loss": 0.7734924554824829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7772361636161804, + "epoch": 1.87, + "learning_rate": 4.5148868225791306e-05, + "loss": 0.4374, + "step": 2216, + "task_loss": 0.6268754005432129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49831143021583557, + "epoch": 1.87, + "learning_rate": 4.514417206724899e-05, + "loss": 0.4814, + "step": 2217, + "task_loss": 0.7295777797698975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4912940263748169, + "epoch": 1.87, + "learning_rate": 4.513947590870668e-05, + "loss": 0.5651, + "step": 2218, + "task_loss": 1.9122928380966187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4721025228500366, + "epoch": 1.88, + "learning_rate": 4.513477975016437e-05, + "loss": 0.4508, + "step": 2219, + "task_loss": 0.4070320725440979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.24298636615276337, + "epoch": 1.88, + "learning_rate": 4.513008359162206e-05, + "loss": 0.5555, + "step": 2220, + "task_loss": 0.35527098178863525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2279401570558548, + "epoch": 1.88, + "learning_rate": 4.5125387433079744e-05, + "loss": 0.5776, + "step": 2221, + "task_loss": 0.2882367968559265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.407845139503479, + "epoch": 1.88, + "learning_rate": 4.512069127453743e-05, + "loss": 0.4195, + "step": 2222, + "task_loss": 0.43978166580200195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5752991437911987, + "epoch": 1.88, + "learning_rate": 4.511599511599512e-05, + "loss": 0.4666, + "step": 2223, + "task_loss": 1.2600221633911133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.413982093334198, + "epoch": 1.88, + "learning_rate": 4.511129895745281e-05, + "loss": 0.4144, + "step": 2224, + "task_loss": 0.7930886745452881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34477707743644714, + "epoch": 1.88, + "learning_rate": 4.510660279891049e-05, + "loss": 0.5246, + "step": 2225, + "task_loss": 0.8348634243011475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.23894600570201874, + "epoch": 1.88, + "learning_rate": 4.510190664036818e-05, + "loss": 0.4209, + "step": 2226, + "task_loss": 0.11770555377006531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3018686771392822, + "epoch": 1.88, + "learning_rate": 4.509721048182587e-05, + "loss": 0.5331, + "step": 2227, + "task_loss": 0.4910077452659607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5459016561508179, + "epoch": 1.88, + "learning_rate": 4.5092514323283556e-05, + "loss": 0.4951, + "step": 2228, + "task_loss": 0.8900324702262878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6447457075119019, + "epoch": 1.88, + "learning_rate": 4.508781816474124e-05, + "loss": 0.4969, + "step": 2229, + "task_loss": 0.48972901701927185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6700356006622314, + "epoch": 1.88, + "learning_rate": 4.508312200619893e-05, + "loss": 0.6886, + "step": 2230, + "task_loss": 1.27056884765625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4674208164215088, + "epoch": 1.89, + "learning_rate": 4.507842584765662e-05, + "loss": 0.6246, + "step": 2231, + "task_loss": 0.8575214743614197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3030596673488617, + "epoch": 1.89, + "learning_rate": 4.507372968911431e-05, + "loss": 0.4507, + "step": 2232, + "task_loss": 0.27985879778862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5640263557434082, + "epoch": 1.89, + "learning_rate": 4.5069033530571994e-05, + "loss": 0.5935, + "step": 2233, + "task_loss": 0.8245450854301453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6157797574996948, + "epoch": 1.89, + "learning_rate": 4.506433737202968e-05, + "loss": 0.512, + "step": 2234, + "task_loss": 0.3441934287548065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7360492944717407, + "epoch": 1.89, + "learning_rate": 4.505964121348737e-05, + "loss": 0.5686, + "step": 2235, + "task_loss": 0.9245166182518005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5701268315315247, + "epoch": 1.89, + "learning_rate": 4.505494505494506e-05, + "loss": 0.3779, + "step": 2236, + "task_loss": 0.38714271783828735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7124241590499878, + "epoch": 1.89, + "learning_rate": 4.5050248896402746e-05, + "loss": 0.625, + "step": 2237, + "task_loss": 0.9268093705177307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5160531401634216, + "epoch": 1.89, + "learning_rate": 4.504555273786043e-05, + "loss": 0.4563, + "step": 2238, + "task_loss": 0.9240202903747559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5146157145500183, + "epoch": 1.89, + "learning_rate": 4.504085657931812e-05, + "loss": 0.4871, + "step": 2239, + "task_loss": 0.4551108479499817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.39063483476638794, + "epoch": 1.89, + "learning_rate": 4.5036160420775805e-05, + "loss": 0.488, + "step": 2240, + "task_loss": 0.5883594155311584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31814855337142944, + "epoch": 1.89, + "learning_rate": 4.50314642622335e-05, + "loss": 0.4786, + "step": 2241, + "task_loss": 0.3789138197898865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36893779039382935, + "epoch": 1.89, + "learning_rate": 4.502676810369118e-05, + "loss": 0.578, + "step": 2242, + "task_loss": 1.401842474937439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45042210817337036, + "epoch": 1.9, + "learning_rate": 4.502207194514887e-05, + "loss": 0.4621, + "step": 2243, + "task_loss": 0.882178008556366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5337485074996948, + "epoch": 1.9, + "learning_rate": 4.501737578660656e-05, + "loss": 0.5552, + "step": 2244, + "task_loss": 0.5124814510345459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.44250887632369995, + "epoch": 1.9, + "learning_rate": 4.501267962806425e-05, + "loss": 0.65, + "step": 2245, + "task_loss": 0.5449134111404419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5950537919998169, + "epoch": 1.9, + "learning_rate": 4.500798346952193e-05, + "loss": 0.5676, + "step": 2246, + "task_loss": 0.7503431439399719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7407461404800415, + "epoch": 1.9, + "learning_rate": 4.5003287310979616e-05, + "loss": 0.582, + "step": 2247, + "task_loss": 0.6328091025352478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6619239449501038, + "epoch": 1.9, + "learning_rate": 4.499859115243731e-05, + "loss": 0.5603, + "step": 2248, + "task_loss": 1.067496418952942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5679786205291748, + "epoch": 1.9, + "learning_rate": 4.4993894993894996e-05, + "loss": 0.5076, + "step": 2249, + "task_loss": 0.5655072927474976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6375457644462585, + "epoch": 1.9, + "learning_rate": 4.498919883535269e-05, + "loss": 0.5349, + "step": 2250, + "task_loss": 1.417980432510376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48016980290412903, + "epoch": 1.9, + "learning_rate": 4.498450267681037e-05, + "loss": 0.4689, + "step": 2251, + "task_loss": 0.8580188751220703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2589854896068573, + "epoch": 1.9, + "learning_rate": 4.497980651826806e-05, + "loss": 0.4222, + "step": 2252, + "task_loss": 0.2614496052265167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5103530287742615, + "epoch": 1.9, + "learning_rate": 4.497511035972575e-05, + "loss": 0.4324, + "step": 2253, + "task_loss": 0.6431676745414734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6913448572158813, + "epoch": 1.9, + "learning_rate": 4.4970414201183434e-05, + "loss": 0.4315, + "step": 2254, + "task_loss": 0.26369866728782654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45819130539894104, + "epoch": 1.91, + "learning_rate": 4.496571804264112e-05, + "loss": 0.3463, + "step": 2255, + "task_loss": 1.0589169263839722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.353249192237854, + "epoch": 1.91, + "learning_rate": 4.496102188409881e-05, + "loss": 0.4943, + "step": 2256, + "task_loss": 0.5804387331008911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.25524723529815674, + "epoch": 1.91, + "learning_rate": 4.49563257255565e-05, + "loss": 0.5528, + "step": 2257, + "task_loss": 0.035556524991989136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37007302045822144, + "epoch": 1.91, + "learning_rate": 4.4951629567014186e-05, + "loss": 0.2923, + "step": 2258, + "task_loss": 0.0676429346203804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3159511089324951, + "epoch": 1.91, + "learning_rate": 4.494693340847187e-05, + "loss": 0.5108, + "step": 2259, + "task_loss": 0.8213542699813843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4573473632335663, + "epoch": 1.91, + "learning_rate": 4.494223724992956e-05, + "loss": 0.4731, + "step": 2260, + "task_loss": 1.334798812866211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5336796641349792, + "epoch": 1.91, + "learning_rate": 4.4937541091387245e-05, + "loss": 0.5235, + "step": 2261, + "task_loss": 0.8721821308135986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37542665004730225, + "epoch": 1.91, + "learning_rate": 4.493284493284494e-05, + "loss": 0.4349, + "step": 2262, + "task_loss": 0.6844853162765503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41831448674201965, + "epoch": 1.91, + "learning_rate": 4.4928148774302625e-05, + "loss": 0.3701, + "step": 2263, + "task_loss": 0.19918948411941528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7168883085250854, + "epoch": 1.91, + "learning_rate": 4.492345261576031e-05, + "loss": 0.5545, + "step": 2264, + "task_loss": 0.4741404056549072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36223727464675903, + "epoch": 1.91, + "learning_rate": 4.4918756457218e-05, + "loss": 0.4288, + "step": 2265, + "task_loss": 0.4317978620529175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.8133450746536255, + "epoch": 1.91, + "learning_rate": 4.4914060298675684e-05, + "loss": 0.641, + "step": 2266, + "task_loss": 1.952117681503296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4052419364452362, + "epoch": 1.92, + "learning_rate": 4.490936414013338e-05, + "loss": 0.4599, + "step": 2267, + "task_loss": 0.324542760848999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38399559259414673, + "epoch": 1.92, + "learning_rate": 4.4904667981591057e-05, + "loss": 0.4148, + "step": 2268, + "task_loss": 0.5394772887229919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3089216649532318, + "epoch": 1.92, + "learning_rate": 4.489997182304875e-05, + "loss": 0.4466, + "step": 2269, + "task_loss": 0.5239662528038025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5696742534637451, + "epoch": 1.92, + "learning_rate": 4.4895275664506436e-05, + "loss": 0.491, + "step": 2270, + "task_loss": 1.0313329696655273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35590529441833496, + "epoch": 1.92, + "learning_rate": 4.489057950596412e-05, + "loss": 0.4242, + "step": 2271, + "task_loss": 0.49920719861984253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.45868146419525146, + "epoch": 1.92, + "learning_rate": 4.488588334742181e-05, + "loss": 0.4737, + "step": 2272, + "task_loss": 0.5445338487625122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.41158390045166016, + "epoch": 1.92, + "learning_rate": 4.4881187188879495e-05, + "loss": 0.5087, + "step": 2273, + "task_loss": 0.08774694055318832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46242597699165344, + "epoch": 1.92, + "learning_rate": 4.487649103033719e-05, + "loss": 0.5241, + "step": 2274, + "task_loss": 0.5694864988327026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5519911646842957, + "epoch": 1.92, + "learning_rate": 4.4871794871794874e-05, + "loss": 0.4687, + "step": 2275, + "task_loss": 0.5505693554878235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3028916120529175, + "epoch": 1.92, + "learning_rate": 4.486709871325256e-05, + "loss": 0.4878, + "step": 2276, + "task_loss": 0.4008864164352417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6774001121520996, + "epoch": 1.92, + "learning_rate": 4.486240255471025e-05, + "loss": 0.6657, + "step": 2277, + "task_loss": 0.26391151547431946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5171905159950256, + "epoch": 1.93, + "learning_rate": 4.4857706396167933e-05, + "loss": 0.5793, + "step": 2278, + "task_loss": 0.7273787260055542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5339009165763855, + "epoch": 1.93, + "learning_rate": 4.4853010237625627e-05, + "loss": 0.5387, + "step": 2279, + "task_loss": 0.9003524780273438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.553004801273346, + "epoch": 1.93, + "learning_rate": 4.484831407908331e-05, + "loss": 0.5062, + "step": 2280, + "task_loss": 0.8329215049743652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5915158987045288, + "epoch": 1.93, + "learning_rate": 4.4843617920541e-05, + "loss": 0.5807, + "step": 2281, + "task_loss": 0.8880163431167603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.34157514572143555, + "epoch": 1.93, + "learning_rate": 4.4838921761998686e-05, + "loss": 0.4766, + "step": 2282, + "task_loss": 0.7678619027137756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.547350287437439, + "epoch": 1.93, + "learning_rate": 4.483422560345638e-05, + "loss": 0.5114, + "step": 2283, + "task_loss": 0.6258320212364197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3251512944698334, + "epoch": 1.93, + "learning_rate": 4.4829529444914065e-05, + "loss": 0.4494, + "step": 2284, + "task_loss": 0.4617779552936554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42849624156951904, + "epoch": 1.93, + "learning_rate": 4.4824833286371745e-05, + "loss": 0.6358, + "step": 2285, + "task_loss": 0.7881009578704834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4692254960536957, + "epoch": 1.93, + "learning_rate": 4.482013712782944e-05, + "loss": 0.4964, + "step": 2286, + "task_loss": 1.0737584829330444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.744013786315918, + "epoch": 1.93, + "learning_rate": 4.4815440969287124e-05, + "loss": 0.6587, + "step": 2287, + "task_loss": 0.9637811183929443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49526044726371765, + "epoch": 1.93, + "learning_rate": 4.481074481074482e-05, + "loss": 0.5807, + "step": 2288, + "task_loss": 1.0202584266662598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4795507788658142, + "epoch": 1.93, + "learning_rate": 4.48060486522025e-05, + "loss": 0.456, + "step": 2289, + "task_loss": 0.9396290183067322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2889283001422882, + "epoch": 1.94, + "learning_rate": 4.480135249366019e-05, + "loss": 0.4967, + "step": 2290, + "task_loss": 0.5020221471786499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7438539266586304, + "epoch": 1.94, + "learning_rate": 4.4796656335117876e-05, + "loss": 0.5163, + "step": 2291, + "task_loss": 2.0630650520324707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4943634867668152, + "epoch": 1.94, + "learning_rate": 4.479196017657556e-05, + "loss": 0.517, + "step": 2292, + "task_loss": 1.1164608001708984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4785299301147461, + "epoch": 1.94, + "learning_rate": 4.4787264018033256e-05, + "loss": 0.5257, + "step": 2293, + "task_loss": 0.6141465902328491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3097531795501709, + "epoch": 1.94, + "learning_rate": 4.4782567859490935e-05, + "loss": 0.592, + "step": 2294, + "task_loss": 0.12399924546480179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5815852880477905, + "epoch": 1.94, + "learning_rate": 4.477787170094863e-05, + "loss": 0.4104, + "step": 2295, + "task_loss": 0.48492249846458435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5815432071685791, + "epoch": 1.94, + "learning_rate": 4.4773175542406315e-05, + "loss": 0.5207, + "step": 2296, + "task_loss": 1.0139658451080322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4569403827190399, + "epoch": 1.94, + "learning_rate": 4.4768479383864e-05, + "loss": 0.5394, + "step": 2297, + "task_loss": 0.8580226898193359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4122565984725952, + "epoch": 1.94, + "learning_rate": 4.476378322532169e-05, + "loss": 0.4653, + "step": 2298, + "task_loss": 0.3986000120639801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.509259819984436, + "epoch": 1.94, + "learning_rate": 4.4759087066779374e-05, + "loss": 0.442, + "step": 2299, + "task_loss": 0.9061173796653748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.42090505361557007, + "epoch": 1.94, + "learning_rate": 4.475439090823707e-05, + "loss": 0.4628, + "step": 2300, + "task_loss": 1.6994291543960571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.19289448857307434, + "epoch": 1.94, + "learning_rate": 4.474969474969475e-05, + "loss": 0.4436, + "step": 2301, + "task_loss": 0.520093560218811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4882197380065918, + "epoch": 1.95, + "learning_rate": 4.474499859115244e-05, + "loss": 0.6214, + "step": 2302, + "task_loss": 0.43309223651885986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.33125039935112, + "epoch": 1.95, + "learning_rate": 4.4740302432610126e-05, + "loss": 0.4446, + "step": 2303, + "task_loss": 1.5631426572799683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46719789505004883, + "epoch": 1.95, + "learning_rate": 4.473560627406781e-05, + "loss": 0.4353, + "step": 2304, + "task_loss": 0.558542013168335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7402175664901733, + "epoch": 1.95, + "learning_rate": 4.4730910115525505e-05, + "loss": 0.5682, + "step": 2305, + "task_loss": 0.7632941603660583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4942198395729065, + "epoch": 1.95, + "learning_rate": 4.4726213956983185e-05, + "loss": 0.4506, + "step": 2306, + "task_loss": 1.0510542392730713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5052782297134399, + "epoch": 1.95, + "learning_rate": 4.472151779844088e-05, + "loss": 0.398, + "step": 2307, + "task_loss": 0.4544996917247772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6877444982528687, + "epoch": 1.95, + "learning_rate": 4.4716821639898564e-05, + "loss": 0.5308, + "step": 2308, + "task_loss": 0.6220576763153076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.40043097734451294, + "epoch": 1.95, + "learning_rate": 4.471212548135625e-05, + "loss": 0.5133, + "step": 2309, + "task_loss": 0.7104654908180237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6454329490661621, + "epoch": 1.95, + "learning_rate": 4.4707429322813944e-05, + "loss": 0.5804, + "step": 2310, + "task_loss": 0.6819896101951599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4672989249229431, + "epoch": 1.95, + "learning_rate": 4.470273316427162e-05, + "loss": 0.4208, + "step": 2311, + "task_loss": 0.5673284530639648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.880226731300354, + "epoch": 1.95, + "learning_rate": 4.4698037005729316e-05, + "loss": 0.5783, + "step": 2312, + "task_loss": 0.895263135433197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5255374312400818, + "epoch": 1.95, + "learning_rate": 4.4693340847187e-05, + "loss": 0.578, + "step": 2313, + "task_loss": 1.2168922424316406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.458711177110672, + "epoch": 1.96, + "learning_rate": 4.4688644688644696e-05, + "loss": 0.5142, + "step": 2314, + "task_loss": 1.1933882236480713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.341196209192276, + "epoch": 1.96, + "learning_rate": 4.4683948530102375e-05, + "loss": 0.5817, + "step": 2315, + "task_loss": 0.5835264325141907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6838213205337524, + "epoch": 1.96, + "learning_rate": 4.467925237156007e-05, + "loss": 0.5073, + "step": 2316, + "task_loss": 1.0636767148971558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5800464153289795, + "epoch": 1.96, + "learning_rate": 4.4674556213017755e-05, + "loss": 0.5445, + "step": 2317, + "task_loss": 0.6344751715660095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.37789446115493774, + "epoch": 1.96, + "learning_rate": 4.466986005447544e-05, + "loss": 0.4076, + "step": 2318, + "task_loss": 0.42768341302871704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5368605256080627, + "epoch": 1.96, + "learning_rate": 4.466516389593313e-05, + "loss": 0.3775, + "step": 2319, + "task_loss": 0.4113626182079315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4308410882949829, + "epoch": 1.96, + "learning_rate": 4.4660467737390814e-05, + "loss": 0.4753, + "step": 2320, + "task_loss": 0.5586192607879639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3236071765422821, + "epoch": 1.96, + "learning_rate": 4.465577157884851e-05, + "loss": 0.3887, + "step": 2321, + "task_loss": 0.4282023310661316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5250446796417236, + "epoch": 1.96, + "learning_rate": 4.4651075420306193e-05, + "loss": 0.4784, + "step": 2322, + "task_loss": 0.4545106887817383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4591291844844818, + "epoch": 1.96, + "learning_rate": 4.464637926176388e-05, + "loss": 0.4527, + "step": 2323, + "task_loss": 0.3421345353126526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49141281843185425, + "epoch": 1.96, + "learning_rate": 4.4641683103221566e-05, + "loss": 0.5293, + "step": 2324, + "task_loss": 0.3139999806880951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3978886008262634, + "epoch": 1.96, + "learning_rate": 4.463698694467925e-05, + "loss": 0.5748, + "step": 2325, + "task_loss": 0.779022753238678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3961014747619629, + "epoch": 1.97, + "learning_rate": 4.4632290786136946e-05, + "loss": 0.5207, + "step": 2326, + "task_loss": 0.9287274479866028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.31038859486579895, + "epoch": 1.97, + "learning_rate": 4.462759462759463e-05, + "loss": 0.3922, + "step": 2327, + "task_loss": 0.29638344049453735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.502657413482666, + "epoch": 1.97, + "learning_rate": 4.462289846905232e-05, + "loss": 0.5031, + "step": 2328, + "task_loss": 0.51375812292099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3382720649242401, + "epoch": 1.97, + "learning_rate": 4.4618202310510005e-05, + "loss": 0.484, + "step": 2329, + "task_loss": 0.39023569226264954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.469430148601532, + "epoch": 1.97, + "learning_rate": 4.461350615196769e-05, + "loss": 0.4501, + "step": 2330, + "task_loss": 0.8305456638336182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5457801222801208, + "epoch": 1.97, + "learning_rate": 4.4608809993425384e-05, + "loss": 0.5484, + "step": 2331, + "task_loss": 0.3412542939186096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38987088203430176, + "epoch": 1.97, + "learning_rate": 4.4604113834883064e-05, + "loss": 0.409, + "step": 2332, + "task_loss": 0.08229457587003708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.35831141471862793, + "epoch": 1.97, + "learning_rate": 4.459941767634076e-05, + "loss": 0.4106, + "step": 2333, + "task_loss": 0.649703323841095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.48139315843582153, + "epoch": 1.97, + "learning_rate": 4.459472151779844e-05, + "loss": 0.5709, + "step": 2334, + "task_loss": 0.9706647992134094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7979201078414917, + "epoch": 1.97, + "learning_rate": 4.459002535925613e-05, + "loss": 0.6436, + "step": 2335, + "task_loss": 1.4212064743041992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4594099223613739, + "epoch": 1.97, + "learning_rate": 4.4585329200713816e-05, + "loss": 0.6134, + "step": 2336, + "task_loss": 1.680701732635498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3693271279335022, + "epoch": 1.97, + "learning_rate": 4.45806330421715e-05, + "loss": 0.4698, + "step": 2337, + "task_loss": 1.2624528408050537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5230467319488525, + "epoch": 1.98, + "learning_rate": 4.4575936883629195e-05, + "loss": 0.4721, + "step": 2338, + "task_loss": 0.8766818046569824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.46258798241615295, + "epoch": 1.98, + "learning_rate": 4.457124072508688e-05, + "loss": 0.5232, + "step": 2339, + "task_loss": 0.11453549563884735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5493994355201721, + "epoch": 1.98, + "learning_rate": 4.4566544566544575e-05, + "loss": 0.4927, + "step": 2340, + "task_loss": 0.1464962512254715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.49796533584594727, + "epoch": 1.98, + "learning_rate": 4.4561848408002254e-05, + "loss": 0.5328, + "step": 2341, + "task_loss": 0.43612194061279297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2752041518688202, + "epoch": 1.98, + "learning_rate": 4.455715224945994e-05, + "loss": 0.3989, + "step": 2342, + "task_loss": 0.4040909707546234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3940029740333557, + "epoch": 1.98, + "learning_rate": 4.4552456090917634e-05, + "loss": 0.3771, + "step": 2343, + "task_loss": 0.8706028461456299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.36160773038864136, + "epoch": 1.98, + "learning_rate": 4.454775993237532e-05, + "loss": 0.4592, + "step": 2344, + "task_loss": 0.7673732042312622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5324007272720337, + "epoch": 1.98, + "learning_rate": 4.4543063773833006e-05, + "loss": 0.6002, + "step": 2345, + "task_loss": 1.7328389883041382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.22970181703567505, + "epoch": 1.98, + "learning_rate": 4.453836761529069e-05, + "loss": 0.428, + "step": 2346, + "task_loss": 0.2570614218711853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.38340407609939575, + "epoch": 1.98, + "learning_rate": 4.4533671456748386e-05, + "loss": 0.5139, + "step": 2347, + "task_loss": 0.4300479590892792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6223381161689758, + "epoch": 1.98, + "learning_rate": 4.452897529820607e-05, + "loss": 0.6749, + "step": 2348, + "task_loss": 1.5073533058166504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2021184265613556, + "epoch": 1.99, + "learning_rate": 4.452427913966375e-05, + "loss": 0.5348, + "step": 2349, + "task_loss": 0.11629484593868256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5701926350593567, + "epoch": 1.99, + "learning_rate": 4.4519582981121445e-05, + "loss": 0.5556, + "step": 2350, + "task_loss": 0.7032594084739685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.5005585551261902, + "epoch": 1.99, + "learning_rate": 4.451488682257913e-05, + "loss": 0.4626, + "step": 2351, + "task_loss": 0.42340579628944397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6794813275337219, + "epoch": 1.99, + "learning_rate": 4.4510190664036824e-05, + "loss": 0.4195, + "step": 2352, + "task_loss": 0.3115620017051697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.629858136177063, + "epoch": 1.99, + "learning_rate": 4.4505494505494504e-05, + "loss": 0.6159, + "step": 2353, + "task_loss": 1.1281862258911133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3484354317188263, + "epoch": 1.99, + "learning_rate": 4.45007983469522e-05, + "loss": 0.4341, + "step": 2354, + "task_loss": 0.15180468559265137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6878166198730469, + "epoch": 1.99, + "learning_rate": 4.449610218840988e-05, + "loss": 0.4281, + "step": 2355, + "task_loss": 0.6516636610031128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.50490802526474, + "epoch": 1.99, + "learning_rate": 4.449140602986757e-05, + "loss": 0.5107, + "step": 2356, + "task_loss": 0.8618326187133789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2663092017173767, + "epoch": 1.99, + "learning_rate": 4.448670987132526e-05, + "loss": 0.4017, + "step": 2357, + "task_loss": 0.24696709215641022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3918919265270233, + "epoch": 1.99, + "learning_rate": 4.448201371278294e-05, + "loss": 0.6027, + "step": 2358, + "task_loss": 0.18858911097049713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6453561186790466, + "epoch": 1.99, + "learning_rate": 4.4477317554240635e-05, + "loss": 0.6237, + "step": 2359, + "task_loss": 0.9329038262367249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2780349850654602, + "epoch": 1.99, + "learning_rate": 4.447262139569832e-05, + "loss": 0.3615, + "step": 2360, + "task_loss": 0.602092444896698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2617350220680237, + "epoch": 2.0, + "learning_rate": 4.446792523715601e-05, + "loss": 0.4915, + "step": 2361, + "task_loss": 0.5249382257461548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.4487283229827881, + "epoch": 2.0, + "learning_rate": 4.4463229078613694e-05, + "loss": 0.568, + "step": 2362, + "task_loss": 0.1316448599100113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.3856232166290283, + "epoch": 2.0, + "learning_rate": 4.445853292007138e-05, + "loss": 0.4988, + "step": 2363, + "task_loss": 1.4342809915542603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.2757651209831238, + "epoch": 2.0, + "learning_rate": 4.4453836761529074e-05, + "loss": 0.625, + "step": 2364, + "task_loss": 1.5079602003097534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.7427771687507629, + "epoch": 2.0, + "learning_rate": 4.444914060298676e-05, + "loss": 0.5224, + "step": 2365, + "task_loss": 1.1306349039077759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -Infinity, + "compression/movement_sparsity/linear_layer_sparsity": 0.0, + "compression/movement_sparsity/model_sparsity": 0.0, + "compression_loss": 0.0, + "distillation_loss": 0.6972264051437378, + "epoch": 2.0, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.6178, + "step": 2366, + "task_loss": 1.0945106744766235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0, + "compression/movement_sparsity/importance_threshold": -0.007003734819591045, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010031444507022475, + "compression/movement_sparsity/model_sparsity": 0.0009686833527873334, + "compression_loss": 0.0, + "distillation_loss": 0.5802435874938965, + "epoch": 2.0, + "learning_rate": 4.443974828590213e-05, + "loss": 0.9462, + "step": 2367, + "task_loss": 1.607359528541565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0008443568368990961, + "compression/movement_sparsity/importance_threshold": -0.006997821168212295, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010031444507022475, + "compression/movement_sparsity/model_sparsity": 0.0009686833527873334, + "compression_loss": 0.09122283011674881, + "distillation_loss": 0.6684330105781555, + "epoch": 2.0, + "learning_rate": 4.443505212735982e-05, + "loss": 0.6762, + "step": 2368, + "task_loss": 0.776258111000061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0016882382476638425, + "compression/movement_sparsity/importance_threshold": -0.006991910846592116, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010031563748698835, + "compression/movement_sparsity/model_sparsity": 0.000968694867323128, + "compression_loss": 0.18239431083202362, + "distillation_loss": 0.5181126594543457, + "epoch": 2.0, + "learning_rate": 4.443035596881751e-05, + "loss": 0.641, + "step": 2369, + "task_loss": 0.5361073613166809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.002531644366179364, + "compression/movement_sparsity/importance_threshold": -0.0069860038537928135, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010092615486994453, + "compression/movement_sparsity/model_sparsity": 0.0009745903096499329, + "compression_loss": 0.2735142409801483, + "distillation_loss": 0.7323172092437744, + "epoch": 2.0, + "learning_rate": 4.44256598102752e-05, + "loss": 0.9388, + "step": 2370, + "task_loss": 0.7403607964515686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0033745753263308975, + "compression/movement_sparsity/importance_threshold": -0.006980100188876689, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010092496245318096, + "compression/movement_sparsity/model_sparsity": 0.0009745787951141384, + "compression_loss": 0.36458295583724976, + "distillation_loss": 0.4879717528820038, + "epoch": 2.0, + "learning_rate": 4.4420963651732885e-05, + "loss": 0.8105, + "step": 2371, + "task_loss": 0.7959870100021362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.004217031262003457, + "compression/movement_sparsity/importance_threshold": -0.006974199850906047, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010000918637874667, + "compression/movement_sparsity/model_sparsity": 0.000965735631623931, + "compression_loss": 0.4555998742580414, + "distillation_loss": 0.3729327917098999, + "epoch": 2.01, + "learning_rate": 4.441626749319057e-05, + "loss": 1.0359, + "step": 2372, + "task_loss": 1.5156570672988892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005059012307081834, + "compression/movement_sparsity/importance_threshold": -0.006968302838943197, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010000918637874667, + "compression/movement_sparsity/model_sparsity": 0.000965735631623931, + "compression_loss": 0.546565055847168, + "distillation_loss": 0.4787403345108032, + "epoch": 2.01, + "learning_rate": 4.441157133464826e-05, + "loss": 1.0266, + "step": 2373, + "task_loss": 0.730780303478241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.005900518595452042, + "compression/movement_sparsity/importance_threshold": -0.006962409152050433, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010214599721909332, + "compression/movement_sparsity/model_sparsity": 0.0009863696797677481, + "compression_loss": 0.6374796032905579, + "distillation_loss": 0.27051985263824463, + "epoch": 2.01, + "learning_rate": 4.440687517610595e-05, + "loss": 1.1848, + "step": 2374, + "task_loss": 0.10961371660232544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.006741550260998652, + "compression/movement_sparsity/importance_threshold": -0.006956518789290066, + "compression/movement_sparsity/linear_layer_sparsity": 0.001030617732935276, + "compression/movement_sparsity/model_sparsity": 0.0009952128432579555, + "compression_loss": 0.7283419966697693, + "distillation_loss": 0.4993123412132263, + "epoch": 2.01, + "learning_rate": 4.440217901756363e-05, + "loss": 1.1853, + "step": 2375, + "task_loss": 0.709650993347168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.007582107437607011, + "compression/movement_sparsity/importance_threshold": -0.006950631749724397, + "compression/movement_sparsity/linear_layer_sparsity": 0.001033670319850057, + "compression/movement_sparsity/model_sparsity": 0.0009981605644213578, + "compression_loss": 0.8191527724266052, + "distillation_loss": 0.37680327892303467, + "epoch": 2.01, + "learning_rate": 4.4397482859021324e-05, + "loss": 1.2489, + "step": 2376, + "task_loss": 0.5057814717292786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.008422190259162132, + "compression/movement_sparsity/importance_threshold": -0.006944748032415731, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010489451785915974, + "compression/movement_sparsity/model_sparsity": 0.0010129106847741646, + "compression_loss": 0.9099125266075134, + "distillation_loss": 0.48812663555145264, + "epoch": 2.01, + "learning_rate": 4.439278670047901e-05, + "loss": 1.4283, + "step": 2377, + "task_loss": 1.0213834047317505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.009261798859549142, + "compression/movement_sparsity/importance_threshold": -0.006938867636426372, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010458925916768166, + "compression/movement_sparsity/model_sparsity": 0.0010099629636107622, + "compression_loss": 1.000620722770691, + "distillation_loss": 0.5056247115135193, + "epoch": 2.01, + "learning_rate": 4.43880905419367e-05, + "loss": 1.5263, + "step": 2378, + "task_loss": 0.05860638618469238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010100933372653165, + "compression/movement_sparsity/importance_threshold": -0.006932990560818625, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010459045158444523, + "compression/movement_sparsity/model_sparsity": 0.0010099744781465567, + "compression_loss": 1.0912773609161377, + "distillation_loss": 0.375686913728714, + "epoch": 2.01, + "learning_rate": 4.438339438339438e-05, + "loss": 1.651, + "step": 2379, + "task_loss": 0.32050254940986633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.010939593932359548, + "compression/movement_sparsity/importance_threshold": -0.006927116804654792, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010550622765887951, + "compression/movement_sparsity/model_sparsity": 0.001018817641636764, + "compression_loss": 1.1818833351135254, + "distillation_loss": 0.3442913889884949, + "epoch": 2.01, + "learning_rate": 4.437869822485207e-05, + "loss": 1.5776, + "step": 2380, + "task_loss": 0.4802236258983612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.011777780672553195, + "compression/movement_sparsity/importance_threshold": -0.006921246366997178, + "compression/movement_sparsity/linear_layer_sparsity": 0.001067272624247919, + "compression/movement_sparsity/model_sparsity": 0.0010306085262903738, + "compression_loss": 1.272438406944275, + "distillation_loss": 0.5577852725982666, + "epoch": 2.01, + "learning_rate": 4.437400206630976e-05, + "loss": 1.8478, + "step": 2381, + "task_loss": 1.0157655477523804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.012615493727119342, + "compression/movement_sparsity/importance_threshold": -0.006915379246908087, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010489690269268692, + "compression/movement_sparsity/model_sparsity": 0.0010129337138457537, + "compression_loss": 1.3629417419433594, + "distillation_loss": 0.3947383761405945, + "epoch": 2.01, + "learning_rate": 4.436930590776745e-05, + "loss": 1.8274, + "step": 2382, + "task_loss": 0.8813773989677429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.013452733229942782, + "compression/movement_sparsity/importance_threshold": -0.006909515443449825, + "compression/movement_sparsity/linear_layer_sparsity": 0.001048980951094505, + "compression/movement_sparsity/model_sparsity": 0.0010129452283815484, + "compression_loss": 1.4533934593200684, + "distillation_loss": 0.5739043354988098, + "epoch": 2.01, + "learning_rate": 4.4364609749225135e-05, + "loss": 1.9145, + "step": 2383, + "task_loss": 0.46072179079055786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.014289499314909304, + "compression/movement_sparsity/importance_threshold": -0.006903654955684692, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010520454621769219, + "compression/movement_sparsity/model_sparsity": 0.0010159044640807452, + "compression_loss": 1.5437926054000854, + "distillation_loss": 0.7259434461593628, + "epoch": 2.02, + "learning_rate": 4.435991359068282e-05, + "loss": 2.0552, + "step": 2384, + "task_loss": 0.8381109237670898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015125792115903702, + "compression/movement_sparsity/importance_threshold": -0.006897797782674995, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010550980490917027, + "compression/movement_sparsity/model_sparsity": 0.0010188521852441477, + "compression_loss": 1.6341404914855957, + "distillation_loss": 0.35940682888031006, + "epoch": 2.02, + "learning_rate": 4.4355217432140514e-05, + "loss": 2.1314, + "step": 2385, + "task_loss": 0.46927493810653687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.015961611766811212, + "compression/movement_sparsity/importance_threshold": -0.006891943923483036, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010596769294638742, + "compression/movement_sparsity/model_sparsity": 0.0010232737669892513, + "compression_loss": 1.7244375944137573, + "distillation_loss": 0.4708237051963806, + "epoch": 2.02, + "learning_rate": 4.43505212735982e-05, + "loss": 2.2354, + "step": 2386, + "task_loss": 0.7868325710296631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01679695840151696, + "compression/movement_sparsity/importance_threshold": -0.006886093377171118, + "compression/movement_sparsity/linear_layer_sparsity": 0.001062729516378655, + "compression/movement_sparsity/model_sparsity": 0.0010262214881526538, + "compression_loss": 1.8146816492080688, + "distillation_loss": 0.546553373336792, + "epoch": 2.02, + "learning_rate": 4.434582511505589e-05, + "loss": 2.252, + "step": 2387, + "task_loss": 0.6013540625572205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.017631832153905957, + "compression/movement_sparsity/importance_threshold": -0.00688024614280155, + "compression/movement_sparsity/linear_layer_sparsity": 0.0010840976247821217, + "compression/movement_sparsity/model_sparsity": 0.001046855536296471, + "compression_loss": 1.9048746824264526, + "distillation_loss": 0.3215025067329407, + "epoch": 2.02, + "learning_rate": 4.434112895651357e-05, + "loss": 2.4363, + "step": 2388, + "task_loss": 0.3925120234489441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.018466233157863443, + "compression/movement_sparsity/importance_threshold": -0.00687440221943663, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011054895815208598, + "compression/movement_sparsity/model_sparsity": 0.0010675126135118772, + "compression_loss": 1.9950146675109863, + "distillation_loss": 0.38974276185035706, + "epoch": 2.02, + "learning_rate": 4.433643279797126e-05, + "loss": 2.5338, + "step": 2389, + "task_loss": 1.1412025690078735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.01930016154727454, + "compression/movement_sparsity/importance_threshold": -0.006868561606138667, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011085421684356408, + "compression/movement_sparsity/model_sparsity": 0.0010704603346752795, + "compression_loss": 2.085103750228882, + "distillation_loss": 0.42029839754104614, + "epoch": 2.02, + "learning_rate": 4.433173663942895e-05, + "loss": 2.6132, + "step": 2390, + "task_loss": 0.801195502281189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.02013361745602449, + "compression/movement_sparsity/importance_threshold": -0.006862724301969961, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011117855420325956, + "compression/movement_sparsity/model_sparsity": 0.0010735922884113948, + "compression_loss": 2.175140142440796, + "distillation_loss": 0.45288002490997314, + "epoch": 2.02, + "learning_rate": 4.432704048088664e-05, + "loss": 2.6946, + "step": 2391, + "task_loss": 0.30602532625198364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.020966601017998188, + "compression/movement_sparsity/importance_threshold": -0.006856890305992819, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011180815025443312, + "compression/movement_sparsity/model_sparsity": 0.001079671963310912, + "compression_loss": 2.265123128890991, + "distillation_loss": 0.2598922848701477, + "epoch": 2.02, + "learning_rate": 4.4322344322344325e-05, + "loss": 2.8098, + "step": 2392, + "task_loss": 0.5974996089935303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.021799112367080653, + "compression/movement_sparsity/importance_threshold": -0.006851059617269545, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011303037743710907, + "compression/movement_sparsity/model_sparsity": 0.0010914743625003165, + "compression_loss": 2.355052947998047, + "distillation_loss": 0.4024933874607086, + "epoch": 2.02, + "learning_rate": 4.431764816380201e-05, + "loss": 2.9442, + "step": 2393, + "task_loss": 0.13177450001239777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.022631151637157676, + "compression/movement_sparsity/importance_threshold": -0.006845232234862439, + "compression/movement_sparsity/linear_layer_sparsity": 0.001154915256371512, + "compression/movement_sparsity/model_sparsity": 0.0011152403643802486, + "compression_loss": 2.4449310302734375, + "distillation_loss": 0.52522873878479, + "epoch": 2.02, + "learning_rate": 4.43129520052597e-05, + "loss": 3.0102, + "step": 2394, + "task_loss": 1.1190540790557861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.023462718962113938, + "compression/movement_sparsity/importance_threshold": -0.006839408157833809, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011705597643097644, + "compression/movement_sparsity/model_sparsity": 0.0011303474353426862, + "compression_loss": 2.5347583293914795, + "distillation_loss": 0.5930206775665283, + "epoch": 2.02, + "learning_rate": 4.430825584671739e-05, + "loss": 3.0029, + "step": 2395, + "task_loss": 1.391053318977356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.024293814475834785, + "compression/movement_sparsity/importance_threshold": -0.006833587385245956, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011829608986510618, + "compression/movement_sparsity/model_sparsity": 0.0011423225525690086, + "compression_loss": 2.6245336532592773, + "distillation_loss": 0.4650835394859314, + "epoch": 2.03, + "learning_rate": 4.430355968817507e-05, + "loss": 3.0461, + "step": 2396, + "task_loss": 0.8159640431404114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025124438312205122, + "compression/movement_sparsity/importance_threshold": -0.006827769916161187, + "compression/movement_sparsity/linear_layer_sparsity": 0.0011730399911780238, + "compression/movement_sparsity/model_sparsity": 0.0011327424587879507, + "compression_loss": 2.714254140853882, + "distillation_loss": 0.5791108012199402, + "epoch": 2.03, + "learning_rate": 4.4298863529632764e-05, + "loss": 3.2754, + "step": 2397, + "task_loss": 0.6679997444152832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.025954590605110184, + "compression/movement_sparsity/importance_threshold": -0.006821955749641805, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012121512610236546, + "compression/movement_sparsity/model_sparsity": 0.0011705101361940446, + "compression_loss": 2.803925037384033, + "distillation_loss": 0.4154004752635956, + "epoch": 2.03, + "learning_rate": 4.429416737109045e-05, + "loss": 3.2209, + "step": 2398, + "task_loss": 1.2762163877487183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.026784271488435207, + "compression/movement_sparsity/importance_threshold": -0.006816144884750112, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012337101561092948, + "compression/movement_sparsity/model_sparsity": 0.0011913284169105744, + "compression_loss": 2.8935439586639404, + "distillation_loss": 0.7611942887306213, + "epoch": 2.03, + "learning_rate": 4.4289471212548136e-05, + "loss": 3.3399, + "step": 2399, + "task_loss": 1.2041484117507935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.027613481096065207, + "compression/movement_sparsity/importance_threshold": -0.006810337320548415, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012401969033032042, + "compression/movement_sparsity/model_sparsity": 0.0011975923243828044, + "compression_loss": 2.983110189437866, + "distillation_loss": 0.548946738243103, + "epoch": 2.03, + "learning_rate": 4.428477505400582e-05, + "loss": 3.5766, + "step": 2400, + "task_loss": 1.5580856800079346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.028442219561885307, + "compression/movement_sparsity/importance_threshold": -0.006804533056099016, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012648203094712614, + "compression/movement_sparsity/model_sparsity": 0.0012213698407985312, + "compression_loss": 3.0726253986358643, + "distillation_loss": 0.3444945812225342, + "epoch": 2.03, + "learning_rate": 4.428007889546351e-05, + "loss": 3.5869, + "step": 2401, + "task_loss": 0.8993453979492188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.029270487019780744, + "compression/movement_sparsity/importance_threshold": -0.0067987320904642195, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012730241368047353, + "compression/movement_sparsity/model_sparsity": 0.0012292918414251754, + "compression_loss": 3.16208815574646, + "distillation_loss": 0.7511818408966064, + "epoch": 2.03, + "learning_rate": 4.42753827369212e-05, + "loss": 3.6665, + "step": 2402, + "task_loss": 0.9963592290878296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03009828360363631, + "compression/movement_sparsity/importance_threshold": -0.006792934422706331, + "compression/movement_sparsity/linear_layer_sparsity": 0.0012846502002497016, + "compression/movement_sparsity/model_sparsity": 0.0012405185138248526, + "compression_loss": 3.251498222351074, + "distillation_loss": 0.2715146541595459, + "epoch": 2.03, + "learning_rate": 4.427068657837889e-05, + "loss": 3.6994, + "step": 2403, + "task_loss": 0.32863888144493103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03092560944733791, + "compression/movement_sparsity/importance_threshold": -0.006787140051887651, + "compression/movement_sparsity/linear_layer_sparsity": 0.001308307748839254, + "compression/movement_sparsity/model_sparsity": 0.0012633633528412215, + "compression_loss": 3.3408586978912354, + "distillation_loss": 0.6217576265335083, + "epoch": 2.03, + "learning_rate": 4.4265990419836575e-05, + "loss": 3.8628, + "step": 2404, + "task_loss": 1.1299502849578857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03175246468477011, + "compression/movement_sparsity/importance_threshold": -0.006781348977070486, + "compression/movement_sparsity/linear_layer_sparsity": 0.0013352205951933965, + "compression/movement_sparsity/model_sparsity": 0.0012893516601295, + "compression_loss": 3.4301633834838867, + "distillation_loss": 0.437610387802124, + "epoch": 2.03, + "learning_rate": 4.426129426129426e-05, + "loss": 3.8313, + "step": 2405, + "task_loss": 0.2818341851234436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03257884944981815, + "compression/movement_sparsity/importance_threshold": -0.0067755611973171394, + "compression/movement_sparsity/linear_layer_sparsity": 0.001357733423689906, + "compression/movement_sparsity/model_sparsity": 0.0013110911037095931, + "compression_loss": 3.5194201469421387, + "distillation_loss": 0.5858157277107239, + "epoch": 2.03, + "learning_rate": 4.425659810275195e-05, + "loss": 3.9596, + "step": 2406, + "task_loss": 0.39481663703918457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03340476387636726, + "compression/movement_sparsity/importance_threshold": -0.006769776711689915, + "compression/movement_sparsity/linear_layer_sparsity": 0.0013873172835944822, + "compression/movement_sparsity/model_sparsity": 0.0013396586670158489, + "compression_loss": 3.6086208820343018, + "distillation_loss": 0.17848718166351318, + "epoch": 2.03, + "learning_rate": 4.425190194420964e-05, + "loss": 4.0177, + "step": 2407, + "task_loss": 0.959834635257721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03423020809830246, + "compression/movement_sparsity/importance_threshold": -0.006763995519251117, + "compression/movement_sparsity/linear_layer_sparsity": 0.0014109748321840343, + "compression/movement_sparsity/model_sparsity": 0.0013625035060322178, + "compression_loss": 3.6977672576904297, + "distillation_loss": 0.600239634513855, + "epoch": 2.04, + "learning_rate": 4.424720578566733e-05, + "loss": 4.1921, + "step": 2408, + "task_loss": 1.0775688886642456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03505518224950899, + "compression/movement_sparsity/importance_threshold": -0.0067582176190630495, + "compression/movement_sparsity/linear_layer_sparsity": 0.0014487625194220843, + "compression/movement_sparsity/model_sparsity": 0.0013989930699651175, + "compression_loss": 3.78686261177063, + "distillation_loss": 0.6846114993095398, + "epoch": 2.04, + "learning_rate": 4.4242509627125013e-05, + "loss": 4.2744, + "step": 2409, + "task_loss": 0.30470189452171326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03587968646387196, + "compression/movement_sparsity/importance_threshold": -0.006752443010188016, + "compression/movement_sparsity/linear_layer_sparsity": 0.0014741371481512007, + "compression/movement_sparsity/model_sparsity": 0.0014234960021359004, + "compression_loss": 3.8759052753448486, + "distillation_loss": 0.4664623439311981, + "epoch": 2.04, + "learning_rate": 4.42378134685827e-05, + "loss": 4.319, + "step": 2410, + "task_loss": 0.4815807342529297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0367037208752764, + "compression/movement_sparsity/importance_threshold": -0.006746671691688321, + "compression/movement_sparsity/linear_layer_sparsity": 0.0015254587656559552, + "compression/movement_sparsity/model_sparsity": 0.001473054564195604, + "compression_loss": 3.9648962020874023, + "distillation_loss": 0.35947346687316895, + "epoch": 2.04, + "learning_rate": 4.423311731004039e-05, + "loss": 4.5273, + "step": 2411, + "task_loss": 1.190804123878479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03752728561760765, + "compression/movement_sparsity/importance_threshold": -0.0067409036626262684, + "compression/movement_sparsity/linear_layer_sparsity": 0.0015901473750805119, + "compression/movement_sparsity/model_sparsity": 0.0015355209208809876, + "compression_loss": 4.053835391998291, + "distillation_loss": 0.731823742389679, + "epoch": 2.04, + "learning_rate": 4.422842115149808e-05, + "loss": 4.4758, + "step": 2412, + "task_loss": 0.2713839113712311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.038350380824750396, + "compression/movement_sparsity/importance_threshold": -0.0067351389220641645, + "compression/movement_sparsity/linear_layer_sparsity": 0.0016405269833420332, + "compression/movement_sparsity/model_sparsity": 0.0015841698346129227, + "compression_loss": 4.1427226066589355, + "distillation_loss": 0.279991090297699, + "epoch": 2.04, + "learning_rate": 4.422372499295576e-05, + "loss": 4.6938, + "step": 2413, + "task_loss": 0.4196307957172394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03917300663059042, + "compression/movement_sparsity/importance_threshold": -0.006729377469064308, + "compression/movement_sparsity/linear_layer_sparsity": 0.0016901315207072233, + "compression/movement_sparsity/model_sparsity": 0.0016320703035182123, + "compression_loss": 4.231555938720703, + "distillation_loss": 0.7195446491241455, + "epoch": 2.04, + "learning_rate": 4.421902883441345e-05, + "loss": 4.6808, + "step": 2414, + "task_loss": 0.7009245157241821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.03999516316901264, + "compression/movement_sparsity/importance_threshold": -0.006723619302689006, + "compression/movement_sparsity/linear_layer_sparsity": 0.0017322953774676349, + "compression/movement_sparsity/model_sparsity": 0.0016727857020877085, + "compression_loss": 4.320333957672119, + "distillation_loss": 0.48594576120376587, + "epoch": 2.04, + "learning_rate": 4.421433267587114e-05, + "loss": 4.8165, + "step": 2415, + "task_loss": 0.350453644990921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04081685057390205, + "compression/movement_sparsity/importance_threshold": -0.006717864422000563, + "compression/movement_sparsity/linear_layer_sparsity": 0.0017637751800263133, + "compression/movement_sparsity/model_sparsity": 0.0017031840765852962, + "compression_loss": 4.409062385559082, + "distillation_loss": 0.6417086720466614, + "epoch": 2.04, + "learning_rate": 4.420963651732883e-05, + "loss": 4.9643, + "step": 2416, + "task_loss": 0.700954020023346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.041638068979143905, + "compression/movement_sparsity/importance_threshold": -0.006712112826061281, + "compression/movement_sparsity/linear_layer_sparsity": 0.0018170046643528057, + "compression/movement_sparsity/model_sparsity": 0.0017545849643721264, + "compression_loss": 4.497735500335693, + "distillation_loss": 0.441211998462677, + "epoch": 2.04, + "learning_rate": 4.420494035878652e-05, + "loss": 4.9931, + "step": 2417, + "task_loss": 0.8304980397224426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04245881851862332, + "compression/movement_sparsity/importance_threshold": -0.006706364513933466, + "compression/movement_sparsity/linear_layer_sparsity": 0.0018530633472836555, + "compression/movement_sparsity/model_sparsity": 0.0017894049206148178, + "compression_loss": 4.586360931396484, + "distillation_loss": 0.527942419052124, + "epoch": 2.04, + "learning_rate": 4.4200244200244204e-05, + "loss": 5.2119, + "step": 2418, + "task_loss": 0.41304224729537964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04327909932622542, + "compression/movement_sparsity/importance_threshold": -0.006700619484679421, + "compression/movement_sparsity/linear_layer_sparsity": 0.001922318912912748, + "compression/movement_sparsity/model_sparsity": 0.0018562813445095108, + "compression_loss": 4.674930095672607, + "distillation_loss": 0.3811076581478119, + "epoch": 2.04, + "learning_rate": 4.419554804170189e-05, + "loss": 5.1897, + "step": 2419, + "task_loss": 0.12234149873256683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04409891153583545, + "compression/movement_sparsity/importance_threshold": -0.00669487773736145, + "compression/movement_sparsity/linear_layer_sparsity": 0.0019389054300942333, + "compression/movement_sparsity/model_sparsity": 0.001872298063799717, + "compression_loss": 4.763443946838379, + "distillation_loss": 0.5080851316452026, + "epoch": 2.05, + "learning_rate": 4.419085188315958e-05, + "loss": 5.2327, + "step": 2420, + "task_loss": 0.8528552651405334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.044918255281338304, + "compression/movement_sparsity/importance_threshold": -0.006689139271041857, + "compression/movement_sparsity/linear_layer_sparsity": 0.0019370094874401312, + "compression/movement_sparsity/model_sparsity": 0.001870467252608385, + "compression_loss": 4.85191011428833, + "distillation_loss": 0.45705446600914, + "epoch": 2.05, + "learning_rate": 4.418615572461727e-05, + "loss": 5.3223, + "step": 2421, + "task_loss": 0.7068832516670227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04573713069661933, + "compression/movement_sparsity/importance_threshold": -0.006683404084782946, + "compression/movement_sparsity/linear_layer_sparsity": 0.0019793760550503524, + "compression/movement_sparsity/model_sparsity": 0.0019113783982863883, + "compression_loss": 4.940328598022461, + "distillation_loss": 0.6961550712585449, + "epoch": 2.05, + "learning_rate": 4.418145956607495e-05, + "loss": 5.3905, + "step": 2422, + "task_loss": 1.6397305727005005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04655553791556333, + "compression/movement_sparsity/importance_threshold": -0.006677672177647024, + "compression/movement_sparsity/linear_layer_sparsity": 0.0020410001533924926, + "compression/movement_sparsity/model_sparsity": 0.0019708855192725754, + "compression_loss": 5.0286865234375, + "distillation_loss": 0.5057326555252075, + "epoch": 2.05, + "learning_rate": 4.417676340753264e-05, + "loss": 5.4762, + "step": 2423, + "task_loss": 0.2978403568267822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04737347707205597, + "compression/movement_sparsity/importance_threshold": -0.006671943548696389, + "compression/movement_sparsity/linear_layer_sparsity": 0.0021424986683089586, + "compression/movement_sparsity/model_sparsity": 0.0020688972479557064, + "compression_loss": 5.116994857788086, + "distillation_loss": 0.4109514653682709, + "epoch": 2.05, + "learning_rate": 4.417206724899033e-05, + "loss": 5.6792, + "step": 2424, + "task_loss": 0.4336155652999878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04819094829998227, + "compression/movement_sparsity/importance_threshold": -0.006666218196993348, + "compression/movement_sparsity/linear_layer_sparsity": 0.00223674728930282, + "compression/movement_sparsity/model_sparsity": 0.0021599081388757567, + "compression_loss": 5.205246448516846, + "distillation_loss": 0.7307251691818237, + "epoch": 2.05, + "learning_rate": 4.4167371090448015e-05, + "loss": 5.8809, + "step": 2425, + "task_loss": 0.9712884426116943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04900795173322714, + "compression/movement_sparsity/importance_threshold": -0.006660496121600205, + "compression/movement_sparsity/linear_layer_sparsity": 0.002262503491396284, + "compression/movement_sparsity/model_sparsity": 0.002184779536191965, + "compression_loss": 5.293447971343994, + "distillation_loss": 0.7137432098388672, + "epoch": 2.05, + "learning_rate": 4.41626749319057e-05, + "loss": 5.8125, + "step": 2426, + "task_loss": 0.23793554306030273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.04982448750567581, + "compression/movement_sparsity/importance_threshold": -0.006654777321579265, + "compression/movement_sparsity/linear_layer_sparsity": 0.0022905729820111055, + "compression/movement_sparsity/model_sparsity": 0.002211884753452314, + "compression_loss": 5.381595134735107, + "distillation_loss": 0.4867357909679413, + "epoch": 2.05, + "learning_rate": 4.415797877336339e-05, + "loss": 5.7374, + "step": 2427, + "task_loss": 0.5562476515769958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05064055575121351, + "compression/movement_sparsity/importance_threshold": -0.00664906179599283, + "compression/movement_sparsity/linear_layer_sparsity": 0.002354486520539331, + "compression/movement_sparsity/model_sparsity": 0.0022736026653110528, + "compression_loss": 5.469688415527344, + "distillation_loss": 0.44602248072624207, + "epoch": 2.05, + "learning_rate": 4.415328261482108e-05, + "loss": 5.8707, + "step": 2428, + "task_loss": 0.3602611720561981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05145615660372527, + "compression/movement_sparsity/importance_threshold": -0.006643349543903205, + "compression/movement_sparsity/linear_layer_sparsity": 0.0023977950973927856, + "compression/movement_sparsity/model_sparsity": 0.002315423459316825, + "compression_loss": 5.557729721069336, + "distillation_loss": 0.4859420359134674, + "epoch": 2.05, + "learning_rate": 4.414858645627877e-05, + "loss": 6.0061, + "step": 2429, + "task_loss": 0.7623806595802307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05227129019709631, + "compression/movement_sparsity/importance_threshold": -0.006637640564372694, + "compression/movement_sparsity/linear_layer_sparsity": 0.002436512869706433, + "compression/movement_sparsity/model_sparsity": 0.002352811157041699, + "compression_loss": 5.6457200050354, + "distillation_loss": 0.5332766175270081, + "epoch": 2.05, + "learning_rate": 4.4143890297736454e-05, + "loss": 6.1881, + "step": 2430, + "task_loss": 1.1421220302581787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05308595666521165, + "compression/movement_sparsity/importance_threshold": -0.006631934856463601, + "compression/movement_sparsity/linear_layer_sparsity": 0.0025361154419687964, + "compression/movement_sparsity/model_sparsity": 0.002448992074533498, + "compression_loss": 5.73365592956543, + "distillation_loss": 0.47381317615509033, + "epoch": 2.05, + "learning_rate": 4.413919413919414e-05, + "loss": 6.2071, + "step": 2431, + "task_loss": 0.6038297414779663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.053900156141956646, + "compression/movement_sparsity/importance_threshold": -0.006626232419238229, + "compression/movement_sparsity/linear_layer_sparsity": 0.002566068951070084, + "compression/movement_sparsity/model_sparsity": 0.0024779165884493844, + "compression_loss": 5.821540355682373, + "distillation_loss": 0.4869072437286377, + "epoch": 2.06, + "learning_rate": 4.4134497980651826e-05, + "loss": 6.3329, + "step": 2432, + "task_loss": 0.49263009428977966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.054713888761215856, + "compression/movement_sparsity/importance_threshold": -0.006620533251758887, + "compression/movement_sparsity/linear_layer_sparsity": 0.0025939238066674604, + "compression/movement_sparsity/model_sparsity": 0.0025048145440654318, + "compression_loss": 5.909368991851807, + "distillation_loss": 0.4851858913898468, + "epoch": 2.06, + "learning_rate": 4.412980182210952e-05, + "loss": 6.3865, + "step": 2433, + "task_loss": 1.0558338165283203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0555271546568753, + "compression/movement_sparsity/importance_threshold": -0.006614837353087871, + "compression/movement_sparsity/linear_layer_sparsity": 0.0026494665795153104, + "compression/movement_sparsity/model_sparsity": 0.0025584492517964027, + "compression_loss": 5.997147083282471, + "distillation_loss": 0.39389896392822266, + "epoch": 2.06, + "learning_rate": 4.4125105663567206e-05, + "loss": 6.4042, + "step": 2434, + "task_loss": 0.4758550822734833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.056339953962819544, + "compression/movement_sparsity/importance_threshold": -0.006609144722287489, + "compression/movement_sparsity/linear_layer_sparsity": 0.0027419861962019714, + "compression/movement_sparsity/model_sparsity": 0.002647790535026245, + "compression_loss": 6.08487606048584, + "distillation_loss": 0.6093442440032959, + "epoch": 2.06, + "learning_rate": 4.412040950502489e-05, + "loss": 6.6246, + "step": 2435, + "task_loss": 1.1199067831039429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05715228681293394, + "compression/movement_sparsity/importance_threshold": -0.006603455358420046, + "compression/movement_sparsity/linear_layer_sparsity": 0.0028184678074183967, + "compression/movement_sparsity/model_sparsity": 0.0027216447676124294, + "compression_loss": 6.172552108764648, + "distillation_loss": 0.7560376524925232, + "epoch": 2.06, + "learning_rate": 4.411571334648258e-05, + "loss": 6.6906, + "step": 2436, + "task_loss": 0.9296551942825317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05796415334110361, + "compression/movement_sparsity/importance_threshold": -0.006597769260547844, + "compression/movement_sparsity/linear_layer_sparsity": 0.002855671210442289, + "compression/movement_sparsity/model_sparsity": 0.002757570119291397, + "compression_loss": 6.26017427444458, + "distillation_loss": 0.7523699998855591, + "epoch": 2.06, + "learning_rate": 4.4111017187940265e-05, + "loss": 6.8076, + "step": 2437, + "task_loss": 1.1962324380874634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.058775553681213566, + "compression/movement_sparsity/importance_threshold": -0.006592086427733187, + "compression/movement_sparsity/linear_layer_sparsity": 0.002935420043590941, + "compression/movement_sparsity/model_sparsity": 0.002834579334685286, + "compression_loss": 6.347743511199951, + "distillation_loss": 0.5805315971374512, + "epoch": 2.06, + "learning_rate": 4.410632102939796e-05, + "loss": 6.8751, + "step": 2438, + "task_loss": 0.4322480857372284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.05958648796714905, + "compression/movement_sparsity/importance_threshold": -0.006586406859038381, + "compression/movement_sparsity/linear_layer_sparsity": 0.0030592525244893787, + "compression/movement_sparsity/model_sparsity": 0.0029541577889115917, + "compression_loss": 6.435264587402344, + "distillation_loss": 0.42431050539016724, + "epoch": 2.06, + "learning_rate": 4.410162487085564e-05, + "loss": 6.9902, + "step": 2439, + "task_loss": 0.42653849720954895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06039695633279518, + "compression/movement_sparsity/importance_threshold": -0.006580730553525728, + "compression/movement_sparsity/linear_layer_sparsity": 0.0031496854118397634, + "compression/movement_sparsity/model_sparsity": 0.003041484028377389, + "compression_loss": 6.522731304168701, + "distillation_loss": 0.5754430890083313, + "epoch": 2.06, + "learning_rate": 4.409692871231333e-05, + "loss": 7.0934, + "step": 2440, + "task_loss": 0.36451107263565063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06120695891203709, + "compression/movement_sparsity/importance_threshold": -0.006575057510257533, + "compression/movement_sparsity/linear_layer_sparsity": 0.003232486831903196, + "compression/movement_sparsity/model_sparsity": 0.0031214409649346804, + "compression_loss": 6.610144138336182, + "distillation_loss": 0.2555873692035675, + "epoch": 2.06, + "learning_rate": 4.409223255377102e-05, + "loss": 6.9159, + "step": 2441, + "task_loss": 0.25501060485839844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06201649583876001, + "compression/movement_sparsity/importance_threshold": -0.006569387728296099, + "compression/movement_sparsity/linear_layer_sparsity": 0.0033410444540600934, + "compression/movement_sparsity/model_sparsity": 0.00322626929880818, + "compression_loss": 6.6974992752075195, + "distillation_loss": 0.439372181892395, + "epoch": 2.06, + "learning_rate": 4.408753639522871e-05, + "loss": 7.1195, + "step": 2442, + "task_loss": 0.6739282608032227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06282556724684862, + "compression/movement_sparsity/importance_threshold": -0.006563721206703733, + "compression/movement_sparsity/linear_layer_sparsity": 0.003441767898080228, + "compression/movement_sparsity/model_sparsity": 0.0033235325826646662, + "compression_loss": 6.784798622131348, + "distillation_loss": 0.631488561630249, + "epoch": 2.07, + "learning_rate": 4.408284023668639e-05, + "loss": 7.153, + "step": 2443, + "task_loss": 0.38315266370773315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06363417327018872, + "compression/movement_sparsity/importance_threshold": -0.006558057944542735, + "compression/movement_sparsity/linear_layer_sparsity": 0.0035547255380947583, + "compression/movement_sparsity/model_sparsity": 0.003432609780246351, + "compression_loss": 6.872047424316406, + "distillation_loss": 0.8471643328666687, + "epoch": 2.07, + "learning_rate": 4.4078144078144076e-05, + "loss": 7.432, + "step": 2444, + "task_loss": 0.9422128796577454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0644423140426652, + "compression/movement_sparsity/importance_threshold": -0.00655239794087541, + "compression/movement_sparsity/linear_layer_sparsity": 0.0037258611920046644, + "compression/movement_sparsity/model_sparsity": 0.0035978663979696003, + "compression_loss": 6.9592437744140625, + "distillation_loss": 0.4266647696495056, + "epoch": 2.07, + "learning_rate": 4.407344791960177e-05, + "loss": 7.4334, + "step": 2445, + "task_loss": 0.20151148736476898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0652499896981632, + "compression/movement_sparsity/importance_threshold": -0.0065467411947640625, + "compression/movement_sparsity/linear_layer_sparsity": 0.0038073390294605164, + "compression/movement_sparsity/model_sparsity": 0.0036765452210536977, + "compression_loss": 7.046391010284424, + "distillation_loss": 0.44936418533325195, + "epoch": 2.07, + "learning_rate": 4.4068751761059455e-05, + "loss": 7.6431, + "step": 2446, + "task_loss": 0.68747878074646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06605720037056773, + "compression/movement_sparsity/importance_threshold": -0.006541087705270998, + "compression/movement_sparsity/linear_layer_sparsity": 0.003908658681862445, + "compression/movement_sparsity/model_sparsity": 0.003774384231699911, + "compression_loss": 7.133489608764648, + "distillation_loss": 0.41012096405029297, + "epoch": 2.07, + "learning_rate": 4.406405560251715e-05, + "loss": 7.5644, + "step": 2447, + "task_loss": 0.45452436804771423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06686394619376412, + "compression/movement_sparsity/importance_threshold": -0.006535437471458518, + "compression/movement_sparsity/linear_layer_sparsity": 0.003995848195615875, + "compression/movement_sparsity/model_sparsity": 0.003858578517429593, + "compression_loss": 7.220540523529053, + "distillation_loss": 0.7610046863555908, + "epoch": 2.07, + "learning_rate": 4.405935944397483e-05, + "loss": 7.7841, + "step": 2448, + "task_loss": 0.7570705413818359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06767022730163741, + "compression/movement_sparsity/importance_threshold": -0.006529790492388927, + "compression/movement_sparsity/linear_layer_sparsity": 0.004108030764734074, + "compression/movement_sparsity/model_sparsity": 0.003966907270184633, + "compression_loss": 7.307538986206055, + "distillation_loss": 0.686468780040741, + "epoch": 2.07, + "learning_rate": 4.405466328543252e-05, + "loss": 7.8301, + "step": 2449, + "task_loss": 0.8293174505233765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06847604382807271, + "compression/movement_sparsity/importance_threshold": -0.00652414676712453, + "compression/movement_sparsity/linear_layer_sparsity": 0.004233592249939712, + "compression/movement_sparsity/model_sparsity": 0.004088155332101147, + "compression_loss": 7.394484996795654, + "distillation_loss": 0.5806015729904175, + "epoch": 2.07, + "learning_rate": 4.404996712689021e-05, + "loss": 7.9085, + "step": 2450, + "task_loss": 0.9789434671401978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.06928139590695526, + "compression/movement_sparsity/importance_threshold": -0.006518506294727631, + "compression/movement_sparsity/linear_layer_sparsity": 0.004365056198125101, + "compression/movement_sparsity/model_sparsity": 0.00421510308923596, + "compression_loss": 7.481378078460693, + "distillation_loss": 0.2894788384437561, + "epoch": 2.07, + "learning_rate": 4.4045270968347894e-05, + "loss": 7.7723, + "step": 2451, + "task_loss": 0.16470776498317719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07008628367217018, + "compression/movement_sparsity/importance_threshold": -0.006512869074260532, + "compression/movement_sparsity/linear_layer_sparsity": 0.004492322839302667, + "compression/movement_sparsity/model_sparsity": 0.004337997729771094, + "compression_loss": 7.568211078643799, + "distillation_loss": 0.36692577600479126, + "epoch": 2.07, + "learning_rate": 4.404057480980558e-05, + "loss": 8.0048, + "step": 2452, + "task_loss": 0.649534285068512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07089070725760216, + "compression/movement_sparsity/importance_threshold": -0.0065072351047855415, + "compression/movement_sparsity/linear_layer_sparsity": 0.00463599713514718, + "compression/movement_sparsity/model_sparsity": 0.004476736371559516, + "compression_loss": 7.654995441436768, + "distillation_loss": 0.36742693185806274, + "epoch": 2.07, + "learning_rate": 4.4035878651263267e-05, + "loss": 8.1637, + "step": 2453, + "task_loss": 1.0155671834945679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0716946667971371, + "compression/movement_sparsity/importance_threshold": -0.006501604385364958, + "compression/movement_sparsity/linear_layer_sparsity": 0.004776797706591451, + "compression/movement_sparsity/model_sparsity": 0.004612700010221454, + "compression_loss": 7.741716384887695, + "distillation_loss": 0.3914766311645508, + "epoch": 2.07, + "learning_rate": 4.403118249272096e-05, + "loss": 8.2329, + "step": 2454, + "task_loss": 0.4086741507053375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07249816242465978, + "compression/movement_sparsity/importance_threshold": -0.006495976915061088, + "compression/movement_sparsity/linear_layer_sparsity": 0.004889755346605981, + "compression/movement_sparsity/model_sparsity": 0.004721777207803138, + "compression_loss": 7.828385829925537, + "distillation_loss": 0.3016759157180786, + "epoch": 2.08, + "learning_rate": 4.4026486334178646e-05, + "loss": 8.2195, + "step": 2455, + "task_loss": 0.37689492106437683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07330119427405535, + "compression/movement_sparsity/importance_threshold": -0.006490352692936236, + "compression/movement_sparsity/linear_layer_sparsity": 0.005030365131368078, + "compression/movement_sparsity/model_sparsity": 0.004857556613892363, + "compression_loss": 7.915004730224609, + "distillation_loss": 0.4386562705039978, + "epoch": 2.08, + "learning_rate": 4.402179017563633e-05, + "loss": 8.3814, + "step": 2456, + "task_loss": 0.7678865790367126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07410376247920902, + "compression/movement_sparsity/importance_threshold": -0.006484731718052705, + "compression/movement_sparsity/linear_layer_sparsity": 0.0052091203283973, + "compression/movement_sparsity/model_sparsity": 0.005030171019988324, + "compression_loss": 8.00157356262207, + "distillation_loss": 0.5517943501472473, + "epoch": 2.08, + "learning_rate": 4.401709401709402e-05, + "loss": 8.4303, + "step": 2457, + "task_loss": 0.8872720003128052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07490586717400582, + "compression/movement_sparsity/importance_threshold": -0.006479113989472799, + "compression/movement_sparsity/linear_layer_sparsity": 0.005423194909963949, + "compression/movement_sparsity/model_sparsity": 0.005236891481107716, + "compression_loss": 8.088088989257812, + "distillation_loss": 0.36474883556365967, + "epoch": 2.08, + "learning_rate": 4.4012397858551705e-05, + "loss": 8.5516, + "step": 2458, + "task_loss": 0.43918728828430176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07570750849233099, + "compression/movement_sparsity/importance_threshold": -0.006473499506258822, + "compression/movement_sparsity/linear_layer_sparsity": 0.005534805119035627, + "compression/movement_sparsity/model_sparsity": 0.005344667536144617, + "compression_loss": 8.174554824829102, + "distillation_loss": 0.2895936369895935, + "epoch": 2.08, + "learning_rate": 4.40077017000094e-05, + "loss": 8.6167, + "step": 2459, + "task_loss": 0.5829182863235474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07650868656806953, + "compression/movement_sparsity/importance_threshold": -0.006467888267473079, + "compression/movement_sparsity/linear_layer_sparsity": 0.005668940080771449, + "compression/movement_sparsity/model_sparsity": 0.005474194549297407, + "compression_loss": 8.260965347290039, + "distillation_loss": 0.5662906169891357, + "epoch": 2.08, + "learning_rate": 4.400300554146708e-05, + "loss": 8.8727, + "step": 2460, + "task_loss": 0.9015250205993652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07730940153510668, + "compression/movement_sparsity/importance_threshold": -0.006462280272177873, + "compression/movement_sparsity/linear_layer_sparsity": 0.005829785178011606, + "compression/movement_sparsity/model_sparsity": 0.005629514122629967, + "compression_loss": 8.347328186035156, + "distillation_loss": 0.5136330723762512, + "epoch": 2.08, + "learning_rate": 4.399830938292477e-05, + "loss": 8.8908, + "step": 2461, + "task_loss": 0.45607006549835205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07810965352732757, + "compression/movement_sparsity/importance_threshold": -0.006456675519435509, + "compression/movement_sparsity/linear_layer_sparsity": 0.006043657048728445, + "compression/movement_sparsity/model_sparsity": 0.005836038836640851, + "compression_loss": 8.433639526367188, + "distillation_loss": 0.4142168462276459, + "epoch": 2.08, + "learning_rate": 4.399361322438246e-05, + "loss": 9.1706, + "step": 2462, + "task_loss": 0.4710100591182709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0789094426786171, + "compression/movement_sparsity/importance_threshold": -0.006451074008308291, + "compression/movement_sparsity/linear_layer_sparsity": 0.006269739267104408, + "compression/movement_sparsity/model_sparsity": 0.006054354435305344, + "compression_loss": 8.519902229309082, + "distillation_loss": 0.4787374436855316, + "epoch": 2.08, + "learning_rate": 4.3988917065840143e-05, + "loss": 9.1018, + "step": 2463, + "task_loss": 0.45049142837524414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.07970876912286096, + "compression/movement_sparsity/importance_threshold": -0.006445475737858521, + "compression/movement_sparsity/linear_layer_sparsity": 0.006413592425463459, + "compression/movement_sparsity/model_sparsity": 0.006193265795130685, + "compression_loss": 8.606109619140625, + "distillation_loss": 0.6395452618598938, + "epoch": 2.08, + "learning_rate": 4.398422090729784e-05, + "loss": 9.0268, + "step": 2464, + "task_loss": 0.16611601412296295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08050763299394392, + "compression/movement_sparsity/importance_threshold": -0.006439880707148503, + "compression/movement_sparsity/linear_layer_sparsity": 0.006644444310893767, + "compression/movement_sparsity/model_sparsity": 0.006416187208112994, + "compression_loss": 8.692270278930664, + "distillation_loss": 0.6031801700592041, + "epoch": 2.08, + "learning_rate": 4.3979524748755516e-05, + "loss": 9.3505, + "step": 2465, + "task_loss": 0.15452896058559418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08130603442575124, + "compression/movement_sparsity/importance_threshold": -0.006434288915240543, + "compression/movement_sparsity/linear_layer_sparsity": 0.006847441340726699, + "compression/movement_sparsity/model_sparsity": 0.006612210665479257, + "compression_loss": 8.778373718261719, + "distillation_loss": 0.7114195823669434, + "epoch": 2.08, + "learning_rate": 4.397482859021321e-05, + "loss": 9.221, + "step": 2466, + "task_loss": 0.7676072120666504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08210397355216792, + "compression/movement_sparsity/importance_threshold": -0.006428700361196945, + "compression/movement_sparsity/linear_layer_sparsity": 0.007123879319028911, + "compression/movement_sparsity/model_sparsity": 0.006879152148804096, + "compression_loss": 8.864429473876953, + "distillation_loss": 0.43020790815353394, + "epoch": 2.09, + "learning_rate": 4.3970132431670896e-05, + "loss": 9.4159, + "step": 2467, + "task_loss": 0.6674639582633972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0829014505070792, + "compression/movement_sparsity/importance_threshold": -0.006423115044080011, + "compression/movement_sparsity/linear_layer_sparsity": 0.007319829165789048, + "compression/movement_sparsity/model_sparsity": 0.0070683705155157855, + "compression_loss": 8.950435638427734, + "distillation_loss": 0.4351471960544586, + "epoch": 2.09, + "learning_rate": 4.396543627312858e-05, + "loss": 9.4226, + "step": 2468, + "task_loss": 1.3550218343734741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0836984654243702, + "compression/movement_sparsity/importance_threshold": -0.006417532962952046, + "compression/movement_sparsity/linear_layer_sparsity": 0.0074594730929726395, + "compression/movement_sparsity/model_sparsity": 0.007203217244205653, + "compression_loss": 9.036383628845215, + "distillation_loss": 0.3953562080860138, + "epoch": 2.09, + "learning_rate": 4.396074011458627e-05, + "loss": 9.4771, + "step": 2469, + "task_loss": 0.12599721550941467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08449501843792617, + "compression/movement_sparsity/importance_threshold": -0.006411954116875354, + "compression/movement_sparsity/linear_layer_sparsity": 0.007694915782942755, + "compression/movement_sparsity/model_sparsity": 0.007430571753468861, + "compression_loss": 9.122285842895508, + "distillation_loss": 0.7781376838684082, + "epoch": 2.09, + "learning_rate": 4.3956043956043955e-05, + "loss": 9.7639, + "step": 2470, + "task_loss": 0.8043678998947144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.085291109681632, + "compression/movement_sparsity/importance_threshold": -0.00640637850491224, + "compression/movement_sparsity/linear_layer_sparsity": 0.007765506855347064, + "compression/movement_sparsity/model_sparsity": 0.007498737805372542, + "compression_loss": 9.208134651184082, + "distillation_loss": 0.3136378824710846, + "epoch": 2.09, + "learning_rate": 4.395134779750165e-05, + "loss": 9.7037, + "step": 2471, + "task_loss": 0.9936558604240417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08608673928937305, + "compression/movement_sparsity/importance_threshold": -0.0064008061261250065, + "compression/movement_sparsity/linear_layer_sparsity": 0.007930728122109583, + "compression/movement_sparsity/model_sparsity": 0.0076582832133416994, + "compression_loss": 9.293932914733887, + "distillation_loss": 0.9721150994300842, + "epoch": 2.09, + "learning_rate": 4.3946651638959334e-05, + "loss": 10.092, + "step": 2472, + "task_loss": 0.48359057307243347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08688190739503443, + "compression/movement_sparsity/importance_threshold": -0.006395236979575958, + "compression/movement_sparsity/linear_layer_sparsity": 0.008300663498844596, + "compression/movement_sparsity/model_sparsity": 0.008015510171831533, + "compression_loss": 9.379682540893555, + "distillation_loss": 0.5698330402374268, + "epoch": 2.09, + "learning_rate": 4.394195548041702e-05, + "loss": 9.9619, + "step": 2473, + "task_loss": 1.0545095205307007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08767661413250083, + "compression/movement_sparsity/importance_threshold": -0.006389671064327401, + "compression/movement_sparsity/linear_layer_sparsity": 0.008496219848072748, + "compression/movement_sparsity/model_sparsity": 0.008204348558862002, + "compression_loss": 9.465387344360352, + "distillation_loss": 0.42641210556030273, + "epoch": 2.09, + "learning_rate": 4.393725932187471e-05, + "loss": 10.0066, + "step": 2474, + "task_loss": 0.5793523192405701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.08847085963565826, + "compression/movement_sparsity/importance_threshold": -0.006384108379441634, + "compression/movement_sparsity/linear_layer_sparsity": 0.008688926321235932, + "compression/movement_sparsity/model_sparsity": 0.008390434971837576, + "compression_loss": 9.551039695739746, + "distillation_loss": 0.5268167853355408, + "epoch": 2.09, + "learning_rate": 4.393256316333239e-05, + "loss": 10.2528, + "step": 2475, + "task_loss": 0.6399965286254883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.0892646440383913, + "compression/movement_sparsity/importance_threshold": -0.006378548923980964, + "compression/movement_sparsity/linear_layer_sparsity": 0.00892911482992512, + "compression/movement_sparsity/model_sparsity": 0.00862237226634701, + "compression_loss": 9.636655807495117, + "distillation_loss": 0.6597421169281006, + "epoch": 2.09, + "learning_rate": 4.3927867004790086e-05, + "loss": 10.2403, + "step": 2476, + "task_loss": 0.44541287422180176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09005796747458517, + "compression/movement_sparsity/importance_threshold": -0.0063729926970076956, + "compression/movement_sparsity/linear_layer_sparsity": 0.009152335248068475, + "compression/movement_sparsity/model_sparsity": 0.008837924376420815, + "compression_loss": 9.722213745117188, + "distillation_loss": 0.7705314755439758, + "epoch": 2.09, + "learning_rate": 4.3923170846247766e-05, + "loss": 10.3992, + "step": 2477, + "task_loss": 0.7143381834030151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.090850830078125, + "compression/movement_sparsity/importance_threshold": -0.006367439697584132, + "compression/movement_sparsity/linear_layer_sparsity": 0.009376688462137238, + "compression/movement_sparsity/model_sparsity": 0.0090545703673951, + "compression_loss": 9.807727813720703, + "distillation_loss": 0.3771783411502838, + "epoch": 2.09, + "learning_rate": 4.391847468770546e-05, + "loss": 10.3228, + "step": 2478, + "task_loss": 0.9188374280929565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09164323198289603, + "compression/movement_sparsity/importance_threshold": -0.006361889924772577, + "compression/movement_sparsity/linear_layer_sparsity": 0.009572065948850853, + "compression/movement_sparsity/model_sparsity": 0.009243236036388651, + "compression_loss": 9.8931884765625, + "distillation_loss": 0.5092464685440063, + "epoch": 2.1, + "learning_rate": 4.3913778529163145e-05, + "loss": 10.4778, + "step": 2479, + "task_loss": 0.5467199087142944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09243517332278328, + "compression/movement_sparsity/importance_threshold": -0.006356343377635335, + "compression/movement_sparsity/linear_layer_sparsity": 0.00977375132024384, + "compression/movement_sparsity/model_sparsity": 0.009437992894817513, + "compression_loss": 9.978598594665527, + "distillation_loss": 0.44155019521713257, + "epoch": 2.1, + "learning_rate": 4.390908237062084e-05, + "loss": 10.479, + "step": 2480, + "task_loss": 1.1968857049942017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09322665423167198, + "compression/movement_sparsity/importance_threshold": -0.006350800055234709, + "compression/movement_sparsity/linear_layer_sparsity": 0.009949453930358282, + "compression/movement_sparsity/model_sparsity": 0.009607659579750072, + "compression_loss": 10.063962936401367, + "distillation_loss": 0.5727531313896179, + "epoch": 2.1, + "learning_rate": 4.3904386212078525e-05, + "loss": 10.6191, + "step": 2481, + "task_loss": 0.5607504844665527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09401767484344714, + "compression/movement_sparsity/importance_threshold": -0.006345259956633005, + "compression/movement_sparsity/linear_layer_sparsity": 0.010137581523149293, + "compression/movement_sparsity/model_sparsity": 0.009789324410980542, + "compression_loss": 10.149272918701172, + "distillation_loss": 0.5028717517852783, + "epoch": 2.1, + "learning_rate": 4.389969005353621e-05, + "loss": 10.7672, + "step": 2482, + "task_loss": 1.2020835876464844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09480823529199378, + "compression/movement_sparsity/importance_threshold": -0.006339723080892528, + "compression/movement_sparsity/linear_layer_sparsity": 0.010396288264176977, + "compression/movement_sparsity/model_sparsity": 0.0100391437795789, + "compression_loss": 10.234527587890625, + "distillation_loss": 0.5130756497383118, + "epoch": 2.1, + "learning_rate": 4.38949938949939e-05, + "loss": 10.661, + "step": 2483, + "task_loss": 0.3096490502357483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09559833571119758, + "compression/movement_sparsity/importance_threshold": -0.006334189427075577, + "compression/movement_sparsity/linear_layer_sparsity": 0.010668159286274653, + "compression/movement_sparsity/model_sparsity": 0.010301675195694429, + "compression_loss": 10.31973648071289, + "distillation_loss": 0.42230138182640076, + "epoch": 2.1, + "learning_rate": 4.3890297736451584e-05, + "loss": 10.7504, + "step": 2484, + "task_loss": 0.5389121770858765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09638797623494333, + "compression/movement_sparsity/importance_threshold": -0.0063286589942444585, + "compression/movement_sparsity/linear_layer_sparsity": 0.010863536772988269, + "compression/movement_sparsity/model_sparsity": 0.01049034086468798, + "compression_loss": 10.404875755310059, + "distillation_loss": 0.384593665599823, + "epoch": 2.1, + "learning_rate": 4.388560157790928e-05, + "loss": 10.9139, + "step": 2485, + "task_loss": 0.30024608969688416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09717715699711615, + "compression/movement_sparsity/importance_threshold": -0.006323131781461477, + "compression/movement_sparsity/linear_layer_sparsity": 0.011040586814045563, + "compression/movement_sparsity/model_sparsity": 0.010661308692165323, + "compression_loss": 10.489977836608887, + "distillation_loss": 0.5715488791465759, + "epoch": 2.1, + "learning_rate": 4.3880905419366956e-05, + "loss": 11.0848, + "step": 2486, + "task_loss": 1.1164438724517822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09796587813160129, + "compression/movement_sparsity/importance_threshold": -0.006317607787788937, + "compression/movement_sparsity/linear_layer_sparsity": 0.011213057974730685, + "compression/movement_sparsity/model_sparsity": 0.01082785493789756, + "compression_loss": 10.575029373168945, + "distillation_loss": 0.6510967016220093, + "epoch": 2.1, + "learning_rate": 4.387620926082465e-05, + "loss": 11.1015, + "step": 2487, + "task_loss": 0.9710118770599365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09875413977228376, + "compression/movement_sparsity/importance_threshold": -0.006312087012289141, + "compression/movement_sparsity/linear_layer_sparsity": 0.011445436153618383, + "compression/movement_sparsity/model_sparsity": 0.011052250211461572, + "compression_loss": 10.660024642944336, + "distillation_loss": 1.3057067394256592, + "epoch": 2.1, + "learning_rate": 4.3871513102282336e-05, + "loss": 11.4003, + "step": 2488, + "task_loss": 2.293001413345337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.09954194205304878, + "compression/movement_sparsity/importance_threshold": -0.006306569454024393, + "compression/movement_sparsity/linear_layer_sparsity": 0.011594070903199416, + "compression/movement_sparsity/model_sparsity": 0.011195778900140523, + "compression_loss": 10.744974136352539, + "distillation_loss": 0.7082593441009521, + "epoch": 2.1, + "learning_rate": 4.386681694374002e-05, + "loss": 11.324, + "step": 2489, + "task_loss": 2.051161527633667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1003292851077815, + "compression/movement_sparsity/importance_threshold": -0.006301055112056999, + "compression/movement_sparsity/linear_layer_sparsity": 0.0118046994003193, + "compression/movement_sparsity/model_sparsity": 0.011399171660415292, + "compression_loss": 10.829861640930176, + "distillation_loss": 0.41983428597450256, + "epoch": 2.1, + "learning_rate": 4.386212078519771e-05, + "loss": 11.2578, + "step": 2490, + "task_loss": 0.16304424405097961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10111616907036702, + "compression/movement_sparsity/importance_threshold": -0.006295543985449261, + "compression/movement_sparsity/linear_layer_sparsity": 0.012079634933499395, + "compression/movement_sparsity/model_sparsity": 0.011664662312230017, + "compression_loss": 10.914708137512207, + "distillation_loss": 0.3453892767429352, + "epoch": 2.11, + "learning_rate": 4.3857424626655395e-05, + "loss": 11.4853, + "step": 2491, + "task_loss": 0.9453598856925964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10190259407469049, + "compression/movement_sparsity/importance_threshold": -0.0062900360732634835, + "compression/movement_sparsity/linear_layer_sparsity": 0.012377655655222518, + "compression/movement_sparsity/model_sparsity": 0.011952445105342976, + "compression_loss": 10.999489784240723, + "distillation_loss": 0.6023022532463074, + "epoch": 2.11, + "learning_rate": 4.385272846811309e-05, + "loss": 11.6984, + "step": 2492, + "task_loss": 1.7473595142364502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10268856025463713, + "compression/movement_sparsity/importance_threshold": -0.00628453137456197, + "compression/movement_sparsity/linear_layer_sparsity": 0.012579507964962407, + "compression/movement_sparsity/model_sparsity": 0.012147363167272962, + "compression_loss": 11.084223747253418, + "distillation_loss": 0.506463348865509, + "epoch": 2.11, + "learning_rate": 4.3848032309570774e-05, + "loss": 11.6328, + "step": 2493, + "task_loss": 0.31890761852264404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10347406774409162, + "compression/movement_sparsity/importance_threshold": -0.006279029888407028, + "compression/movement_sparsity/linear_layer_sparsity": 0.012825813571648794, + "compression/movement_sparsity/model_sparsity": 0.012385207418644998, + "compression_loss": 11.168905258178711, + "distillation_loss": 0.4475681781768799, + "epoch": 2.11, + "learning_rate": 4.384333615102846e-05, + "loss": 11.7254, + "step": 2494, + "task_loss": 0.8881322741508484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10425911667693977, + "compression/movement_sparsity/importance_threshold": -0.006273531613860957, + "compression/movement_sparsity/linear_layer_sparsity": 0.013013547666907821, + "compression/movement_sparsity/model_sparsity": 0.012566492270194247, + "compression_loss": 11.253530502319336, + "distillation_loss": 0.37442925572395325, + "epoch": 2.11, + "learning_rate": 4.383863999248615e-05, + "loss": 11.8431, + "step": 2495, + "task_loss": 0.49670886993408203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10504370718706657, + "compression/movement_sparsity/importance_threshold": -0.006268036549986061, + "compression/movement_sparsity/linear_layer_sparsity": 0.013253378450567933, + "compression/movement_sparsity/model_sparsity": 0.012798084128629846, + "compression_loss": 11.338117599487305, + "distillation_loss": 0.5494994521141052, + "epoch": 2.11, + "learning_rate": 4.383394383394383e-05, + "loss": 11.9346, + "step": 2496, + "task_loss": 0.31456267833709717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10582783940835694, + "compression/movement_sparsity/importance_threshold": -0.006262544695844647, + "compression/movement_sparsity/linear_layer_sparsity": 0.013474869864404089, + "compression/movement_sparsity/model_sparsity": 0.013011966631013443, + "compression_loss": 11.422650337219238, + "distillation_loss": 1.0991010665893555, + "epoch": 2.11, + "learning_rate": 4.3829247675401526e-05, + "loss": 12.0807, + "step": 2497, + "task_loss": 1.6813899278640747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1066115134746961, + "compression/movement_sparsity/importance_threshold": -0.006257056050499016, + "compression/movement_sparsity/linear_layer_sparsity": 0.013746740886501766, + "compression/movement_sparsity/model_sparsity": 0.013274498047128972, + "compression_loss": 11.507121086120605, + "distillation_loss": 0.5585469007492065, + "epoch": 2.11, + "learning_rate": 4.382455151685921e-05, + "loss": 11.9785, + "step": 2498, + "task_loss": 0.7304984927177429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1073947295199692, + "compression/movement_sparsity/importance_threshold": -0.006251570613011475, + "compression/movement_sparsity/linear_layer_sparsity": 0.014023190788971613, + "compression/movement_sparsity/model_sparsity": 0.013541451044989605, + "compression_loss": 11.591545104980469, + "distillation_loss": 0.45181185007095337, + "epoch": 2.11, + "learning_rate": 4.38198553583169e-05, + "loss": 11.9796, + "step": 2499, + "task_loss": 0.9213634133338928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10817748767806146, + "compression/movement_sparsity/importance_threshold": -0.006246088382444325, + "compression/movement_sparsity/linear_layer_sparsity": 0.014224637677011884, + "compression/movement_sparsity/model_sparsity": 0.013735977612702578, + "compression_loss": 11.675919532775879, + "distillation_loss": 0.541807234287262, + "epoch": 2.11, + "learning_rate": 4.3815159199774585e-05, + "loss": 12.263, + "step": 2500, + "task_loss": 0.557203471660614 + }, + { + "epoch": 2.11, + "eval_accuracy": 0.9057425742574258, + "eval_loss": 12.00404167175293, + "eval_runtime": 228.1482, + "eval_samples_per_second": 110.674, + "eval_steps_per_second": 0.868, + "step": 2500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10895978808285789, + "compression/movement_sparsity/importance_threshold": -0.006240609357859872, + "compression/movement_sparsity/linear_layer_sparsity": 0.01456655125980262, + "compression/movement_sparsity/model_sparsity": 0.01406614541207524, + "compression_loss": 11.760249137878418, + "distillation_loss": 0.23940984904766083, + "epoch": 2.11, + "learning_rate": 4.381046304123227e-05, + "loss": 12.2371, + "step": 2501, + "task_loss": 0.22043900191783905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.10974163086824362, + "compression/movement_sparsity/importance_threshold": -0.00623513353832042, + "compression/movement_sparsity/linear_layer_sparsity": 0.014817066097664465, + "compression/movement_sparsity/model_sparsity": 0.014308054294582748, + "compression_loss": 11.844521522521973, + "distillation_loss": 0.5588322877883911, + "epoch": 2.11, + "learning_rate": 4.3805766882689965e-05, + "loss": 12.4128, + "step": 2502, + "task_loss": 0.40949487686157227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.110523016168104, + "compression/movement_sparsity/importance_threshold": -0.006229660922888272, + "compression/movement_sparsity/linear_layer_sparsity": 0.015096389724534557, + "compression/movement_sparsity/model_sparsity": 0.014577782295569866, + "compression_loss": 11.928736686706543, + "distillation_loss": 0.5555133819580078, + "epoch": 2.12, + "learning_rate": 4.3801070724147645e-05, + "loss": 12.4085, + "step": 2503, + "task_loss": 0.6480487585067749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1113039441163236, + "compression/movement_sparsity/importance_threshold": -0.006224191510625734, + "compression/movement_sparsity/linear_layer_sparsity": 0.015337544090802251, + "compression/movement_sparsity/model_sparsity": 0.01481065226747866, + "compression_loss": 12.012903213500977, + "distillation_loss": 0.6601122617721558, + "epoch": 2.12, + "learning_rate": 4.379637456560534e-05, + "loss": 12.6061, + "step": 2504, + "task_loss": 1.199681282043457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11208441484678833, + "compression/movement_sparsity/importance_threshold": -0.0062187253005951066, + "compression/movement_sparsity/linear_layer_sparsity": 0.015536355737794995, + "compression/movement_sparsity/model_sparsity": 0.015002634122781038, + "compression_loss": 12.097017288208008, + "distillation_loss": 0.5549905896186829, + "epoch": 2.12, + "learning_rate": 4.3791678407063024e-05, + "loss": 12.6452, + "step": 2505, + "task_loss": 0.6281686425209045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11286442849338296, + "compression/movement_sparsity/importance_threshold": -0.006213262291858696, + "compression/movement_sparsity/linear_layer_sparsity": 0.01579906899914833, + "compression/movement_sparsity/model_sparsity": 0.015256322375406361, + "compression_loss": 12.181082725524902, + "distillation_loss": 0.7515906691551208, + "epoch": 2.12, + "learning_rate": 4.378698224852072e-05, + "loss": 12.744, + "step": 2506, + "task_loss": 0.7625669836997986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11364398518999275, + "compression/movement_sparsity/importance_threshold": -0.006207802483478804, + "compression/movement_sparsity/linear_layer_sparsity": 0.016079704284458363, + "compression/movement_sparsity/model_sparsity": 0.015527316975330878, + "compression_loss": 12.265105247497559, + "distillation_loss": 0.48221731185913086, + "epoch": 2.12, + "learning_rate": 4.37822860899784e-05, + "loss": 12.8517, + "step": 2507, + "task_loss": 0.27545469999313354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1144230850705027, + "compression/movement_sparsity/importance_threshold": -0.0062023458745177375, + "compression/movement_sparsity/linear_layer_sparsity": 0.016428092690275373, + "compression/movement_sparsity/model_sparsity": 0.015863737167639978, + "compression_loss": 12.349076271057129, + "distillation_loss": 0.7552988529205322, + "epoch": 2.12, + "learning_rate": 4.377758993143608e-05, + "loss": 12.9785, + "step": 2508, + "task_loss": 0.7426744699478149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11520172826879793, + "compression/movement_sparsity/importance_threshold": -0.006196892464037799, + "compression/movement_sparsity/linear_layer_sparsity": 0.016653984121969164, + "compression/movement_sparsity/model_sparsity": 0.01608186853373176, + "compression_loss": 12.432995796203613, + "distillation_loss": 0.7691762447357178, + "epoch": 2.12, + "learning_rate": 4.3772893772893776e-05, + "loss": 12.9973, + "step": 2509, + "task_loss": 0.7293866872787476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1159799149187637, + "compression/movement_sparsity/importance_threshold": -0.006191442251101293, + "compression/movement_sparsity/linear_layer_sparsity": 0.016889224101089468, + "compression/movement_sparsity/model_sparsity": 0.016309027295886458, + "compression_loss": 12.516862869262695, + "distillation_loss": 0.3465961813926697, + "epoch": 2.12, + "learning_rate": 4.376819761435146e-05, + "loss": 12.8938, + "step": 2510, + "task_loss": 0.1730797290802002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1167576451542851, + "compression/movement_sparsity/importance_threshold": -0.006185995234770523, + "compression/movement_sparsity/linear_layer_sparsity": 0.01728646582171061, + "compression/movement_sparsity/model_sparsity": 0.016692622541345792, + "compression_loss": 12.600663185119629, + "distillation_loss": 0.7592371106147766, + "epoch": 2.12, + "learning_rate": 4.3763501455809156e-05, + "loss": 13.2876, + "step": 2511, + "task_loss": 1.2177231311798096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1175349191092473, + "compression/movement_sparsity/importance_threshold": -0.0061805514141077935, + "compression/movement_sparsity/linear_layer_sparsity": 0.017675682577512813, + "compression/movement_sparsity/model_sparsity": 0.0170684685042154, + "compression_loss": 12.684412002563477, + "distillation_loss": 0.5324851274490356, + "epoch": 2.12, + "learning_rate": 4.3758805297266835e-05, + "loss": 13.2559, + "step": 2512, + "task_loss": 0.3248615860939026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11831173691753538, + "compression/movement_sparsity/importance_threshold": -0.006175110788175408, + "compression/movement_sparsity/linear_layer_sparsity": 0.017997933207872012, + "compression/movement_sparsity/model_sparsity": 0.01737964883406286, + "compression_loss": 12.76811408996582, + "distillation_loss": 0.1859724223613739, + "epoch": 2.12, + "learning_rate": 4.375410913872453e-05, + "loss": 13.1678, + "step": 2513, + "task_loss": 0.09421936422586441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11908809871303416, + "compression/movement_sparsity/importance_threshold": -0.006169673356035673, + "compression/movement_sparsity/linear_layer_sparsity": 0.018382547234966774, + "compression/movement_sparsity/model_sparsity": 0.017751050186115776, + "compression_loss": 12.851753234863281, + "distillation_loss": 0.5571901202201843, + "epoch": 2.13, + "learning_rate": 4.3749412980182215e-05, + "loss": 13.3425, + "step": 2514, + "task_loss": 1.1220035552978516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.11986400462962954, + "compression/movement_sparsity/importance_threshold": -0.006164239116750887, + "compression/movement_sparsity/linear_layer_sparsity": 0.0186557656880073, + "compression/movement_sparsity/model_sparsity": 0.018014882744776087, + "compression_loss": 12.935338020324707, + "distillation_loss": 0.5262259840965271, + "epoch": 2.13, + "learning_rate": 4.37447168216399e-05, + "loss": 13.4572, + "step": 2515, + "task_loss": 0.5436557531356812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12063945480120619, + "compression/movement_sparsity/importance_threshold": -0.0061588080693833576, + "compression/movement_sparsity/linear_layer_sparsity": 0.018965603259857567, + "compression/movement_sparsity/model_sparsity": 0.018314076442861437, + "compression_loss": 13.018871307373047, + "distillation_loss": 0.9457110166549683, + "epoch": 2.13, + "learning_rate": 4.374002066309759e-05, + "loss": 13.5598, + "step": 2516, + "task_loss": 1.0214468240737915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12141444936164936, + "compression/movement_sparsity/importance_threshold": -0.006153380212995388, + "compression/movement_sparsity/linear_layer_sparsity": 0.019454017166222515, + "compression/movement_sparsity/model_sparsity": 0.018785711829005826, + "compression_loss": 13.102352142333984, + "distillation_loss": 0.7864676117897034, + "epoch": 2.13, + "learning_rate": 4.3735324504555274e-05, + "loss": 13.635, + "step": 2517, + "task_loss": 1.5607138872146606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12218898844484405, + "compression/movement_sparsity/importance_threshold": -0.006147955546649283, + "compression/movement_sparsity/linear_layer_sparsity": 0.019872436208564947, + "compression/movement_sparsity/model_sparsity": 0.019189756890036262, + "compression_loss": 13.185774803161621, + "distillation_loss": 0.3986533582210541, + "epoch": 2.13, + "learning_rate": 4.373062834601297e-05, + "loss": 13.6547, + "step": 2518, + "task_loss": 0.4902108311653137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12296307218467561, + "compression/movement_sparsity/importance_threshold": -0.006142534069407346, + "compression/movement_sparsity/linear_layer_sparsity": 0.02015060319117436, + "compression/movement_sparsity/model_sparsity": 0.01945836798105131, + "compression_loss": 13.269147872924805, + "distillation_loss": 0.5591965317726135, + "epoch": 2.13, + "learning_rate": 4.372593218747065e-05, + "loss": 13.8165, + "step": 2519, + "task_loss": 0.47914987802505493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12373670071502896, + "compression/movement_sparsity/importance_threshold": -0.006137115780331881, + "compression/movement_sparsity/linear_layer_sparsity": 0.020604103134701502, + "compression/movement_sparsity/model_sparsity": 0.019896288806389287, + "compression_loss": 13.352471351623535, + "distillation_loss": 0.39664262533187866, + "epoch": 2.13, + "learning_rate": 4.372123602892834e-05, + "loss": 13.9905, + "step": 2520, + "task_loss": 0.36817216873168945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12450987416978943, + "compression/movement_sparsity/importance_threshold": -0.006131700678485191, + "compression/movement_sparsity/linear_layer_sparsity": 0.020872730783202224, + "compression/movement_sparsity/model_sparsity": 0.020155688268768703, + "compression_loss": 13.435741424560547, + "distillation_loss": 0.44774866104125977, + "epoch": 2.13, + "learning_rate": 4.3716539870386026e-05, + "loss": 14.0686, + "step": 2521, + "task_loss": 0.8100059032440186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12528259268284203, + "compression/movement_sparsity/importance_threshold": -0.006126288762929582, + "compression/movement_sparsity/linear_layer_sparsity": 0.02121614681111508, + "compression/movement_sparsity/model_sparsity": 0.02048730689965148, + "compression_loss": 13.518967628479004, + "distillation_loss": 0.4800080358982086, + "epoch": 2.13, + "learning_rate": 4.371184371184371e-05, + "loss": 14.028, + "step": 2522, + "task_loss": 0.7068597078323364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12605485638807157, + "compression/movement_sparsity/importance_threshold": -0.0061208800327273594, + "compression/movement_sparsity/linear_layer_sparsity": 0.021501575611814733, + "compression/movement_sparsity/model_sparsity": 0.0207629303429654, + "compression_loss": 13.602137565612793, + "distillation_loss": 0.4840482771396637, + "epoch": 2.13, + "learning_rate": 4.3707147553301405e-05, + "loss": 14.045, + "step": 2523, + "task_loss": 0.6037019491195679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12682666541936405, + "compression/movement_sparsity/importance_threshold": -0.006115474486940822, + "compression/movement_sparsity/linear_layer_sparsity": 0.021842141763662615, + "compression/movement_sparsity/model_sparsity": 0.02109179699979328, + "compression_loss": 13.685258865356445, + "distillation_loss": 0.4864698648452759, + "epoch": 2.13, + "learning_rate": 4.370245139475909e-05, + "loss": 14.1694, + "step": 2524, + "task_loss": 0.7226937413215637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12759801991060393, + "compression/movement_sparsity/importance_threshold": -0.006110072124632277, + "compression/movement_sparsity/linear_layer_sparsity": 0.02225289356621519, + "compression/movement_sparsity/model_sparsity": 0.021488438214307827, + "compression_loss": 13.76832389831543, + "distillation_loss": 0.89805006980896, + "epoch": 2.13, + "learning_rate": 4.369775523621678e-05, + "loss": 14.3239, + "step": 2525, + "task_loss": 1.6137539148330688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12836891999567668, + "compression/movement_sparsity/importance_threshold": -0.0061046729448640275, + "compression/movement_sparsity/linear_layer_sparsity": 0.02256770351596961, + "compression/movement_sparsity/model_sparsity": 0.021792433473819497, + "compression_loss": 13.851346969604492, + "distillation_loss": 0.6718326807022095, + "epoch": 2.14, + "learning_rate": 4.3693059077674464e-05, + "loss": 14.3839, + "step": 2526, + "task_loss": 0.7194912433624268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1291393658084673, + "compression/movement_sparsity/importance_threshold": -0.0060992769466983775, + "compression/movement_sparsity/linear_layer_sparsity": 0.022908055032800047, + "compression/movement_sparsity/model_sparsity": 0.022121092869003076, + "compression_loss": 13.934311866760254, + "distillation_loss": 0.5152491331100464, + "epoch": 2.14, + "learning_rate": 4.368836291913215e-05, + "loss": 14.4245, + "step": 2527, + "task_loss": 0.9203323125839233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.12990935748286092, + "compression/movement_sparsity/importance_threshold": -0.006093884129197632, + "compression/movement_sparsity/linear_layer_sparsity": 0.02323011487647707, + "compression/movement_sparsity/model_sparsity": 0.022432088966277828, + "compression_loss": 14.017224311828613, + "distillation_loss": 0.655097484588623, + "epoch": 2.14, + "learning_rate": 4.3683666760589844e-05, + "loss": 14.5106, + "step": 2528, + "task_loss": 0.7409726977348328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13067889515274267, + "compression/movement_sparsity/importance_threshold": -0.006088494491424094, + "compression/movement_sparsity/linear_layer_sparsity": 0.02372025778714922, + "compression/movement_sparsity/model_sparsity": 0.022905393960112427, + "compression_loss": 14.100090026855469, + "distillation_loss": 0.43207496404647827, + "epoch": 2.14, + "learning_rate": 4.367897060204752e-05, + "loss": 14.701, + "step": 2529, + "task_loss": 0.570621132850647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13144797895199778, + "compression/movement_sparsity/importance_threshold": -0.006083108032440067, + "compression/movement_sparsity/linear_layer_sparsity": 0.024085435420997524, + "compression/movement_sparsity/model_sparsity": 0.02325802661882024, + "compression_loss": 14.182893753051758, + "distillation_loss": 0.621110737323761, + "epoch": 2.14, + "learning_rate": 4.3674274443505216e-05, + "loss": 14.6946, + "step": 2530, + "task_loss": 1.1413514614105225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13221660901451127, + "compression/movement_sparsity/importance_threshold": -0.006077724751307858, + "compression/movement_sparsity/linear_layer_sparsity": 0.024496020285203195, + "compression/movement_sparsity/model_sparsity": 0.023654506629833663, + "compression_loss": 14.265647888183594, + "distillation_loss": 0.7209075689315796, + "epoch": 2.14, + "learning_rate": 4.36695782849629e-05, + "loss": 14.8499, + "step": 2531, + "task_loss": 1.5845314264297485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13298478547416837, + "compression/movement_sparsity/importance_threshold": -0.006072344647089767, + "compression/movement_sparsity/linear_layer_sparsity": 0.024894561740096646, + "compression/movement_sparsity/model_sparsity": 0.0240393569596946, + "compression_loss": 14.348350524902344, + "distillation_loss": 0.879779577255249, + "epoch": 2.14, + "learning_rate": 4.366488212642059e-05, + "loss": 14.8737, + "step": 2532, + "task_loss": 0.9149445295333862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1337525084648542, + "compression/movement_sparsity/importance_threshold": -0.0060669677188481, + "compression/movement_sparsity/linear_layer_sparsity": 0.02528680723447836, + "compression/movement_sparsity/model_sparsity": 0.02441812761465602, + "compression_loss": 14.43100357055664, + "distillation_loss": 0.8005964756011963, + "epoch": 2.14, + "learning_rate": 4.3660185967878275e-05, + "loss": 14.9526, + "step": 2533, + "task_loss": 2.118946075439453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13451977812045357, + "compression/movement_sparsity/importance_threshold": -0.006061593965645163, + "compression/movement_sparsity/linear_layer_sparsity": 0.02565349923761642, + "compression/movement_sparsity/model_sparsity": 0.02477222261940974, + "compression_loss": 14.513608932495117, + "distillation_loss": 0.4513401985168457, + "epoch": 2.14, + "learning_rate": 4.365548980933596e-05, + "loss": 15.0254, + "step": 2534, + "task_loss": 1.3878161907196045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13528659457485237, + "compression/movement_sparsity/importance_threshold": -0.006056223386543255, + "compression/movement_sparsity/linear_layer_sparsity": 0.02606561039527948, + "compression/movement_sparsity/model_sparsity": 0.02517017649100486, + "compression_loss": 14.596161842346191, + "distillation_loss": 0.36565977334976196, + "epoch": 2.14, + "learning_rate": 4.3650793650793655e-05, + "loss": 15.1483, + "step": 2535, + "task_loss": 0.2199409157037735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13605295796193517, + "compression/movement_sparsity/importance_threshold": -0.006050855980604683, + "compression/movement_sparsity/linear_layer_sparsity": 0.02647772155294254, + "compression/movement_sparsity/model_sparsity": 0.025568130362599986, + "compression_loss": 14.67866325378418, + "distillation_loss": 0.6575306057929993, + "epoch": 2.14, + "learning_rate": 4.364609749225134e-05, + "loss": 15.1849, + "step": 2536, + "task_loss": 0.9486361145973206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13681886841558732, + "compression/movement_sparsity/importance_threshold": -0.006045491746891751, + "compression/movement_sparsity/linear_layer_sparsity": 0.026931221496469683, + "compression/movement_sparsity/model_sparsity": 0.026006051187937963, + "compression_loss": 14.761101722717285, + "distillation_loss": 0.49703025817871094, + "epoch": 2.14, + "learning_rate": 4.364140133370903e-05, + "loss": 15.3274, + "step": 2537, + "task_loss": 0.5364235043525696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13758432606969384, + "compression/movement_sparsity/importance_threshold": -0.006040130684466762, + "compression/movement_sparsity/linear_layer_sparsity": 0.02735706929524926, + "compression/movement_sparsity/model_sparsity": 0.026417269804768398, + "compression_loss": 14.843487739562988, + "distillation_loss": 0.5222068428993225, + "epoch": 2.15, + "learning_rate": 4.3636705175166714e-05, + "loss": 15.4, + "step": 2538, + "task_loss": 0.212115079164505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.13834933105813996, + "compression/movement_sparsity/importance_threshold": -0.006034772792392022, + "compression/movement_sparsity/linear_layer_sparsity": 0.02788231695544139, + "compression/movement_sparsity/model_sparsity": 0.026924473591982124, + "compression_loss": 14.925821304321289, + "distillation_loss": 0.4889252781867981, + "epoch": 2.15, + "learning_rate": 4.36320090166244e-05, + "loss": 15.51, + "step": 2539, + "task_loss": 0.5091556906700134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1391138835148108, + "compression/movement_sparsity/importance_threshold": -0.006029418069729832, + "compression/movement_sparsity/linear_layer_sparsity": 0.028337164329911384, + "compression/movement_sparsity/model_sparsity": 0.027363695559864884, + "compression_loss": 15.008112907409668, + "distillation_loss": 0.357362300157547, + "epoch": 2.15, + "learning_rate": 4.362731285808209e-05, + "loss": 15.4813, + "step": 2540, + "task_loss": 0.11102087050676346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1398779835735915, + "compression/movement_sparsity/importance_threshold": -0.006024066515542498, + "compression/movement_sparsity/linear_layer_sparsity": 0.028814882257906963, + "compression/movement_sparsity/model_sparsity": 0.027825002407401572, + "compression_loss": 15.090338706970215, + "distillation_loss": 0.8271412253379822, + "epoch": 2.15, + "learning_rate": 4.362261669953978e-05, + "loss": 15.616, + "step": 2541, + "task_loss": 1.4883544445037842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14064163136836716, + "compression/movement_sparsity/importance_threshold": -0.006018718128892324, + "compression/movement_sparsity/linear_layer_sparsity": 0.029297572563806697, + "compression/movement_sparsity/model_sparsity": 0.02829111081636458, + "compression_loss": 15.172521591186523, + "distillation_loss": 0.6153432130813599, + "epoch": 2.15, + "learning_rate": 4.3617920540997466e-05, + "loss": 15.7125, + "step": 2542, + "task_loss": 1.3548022508621216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14140482703302293, + "compression/movement_sparsity/importance_threshold": -0.006013372908841613, + "compression/movement_sparsity/linear_layer_sparsity": 0.029763676352524946, + "compression/movement_sparsity/model_sparsity": 0.02874120250603739, + "compression_loss": 15.254646301269531, + "distillation_loss": 0.6039646863937378, + "epoch": 2.15, + "learning_rate": 4.361322438245515e-05, + "loss": 15.777, + "step": 2543, + "task_loss": 0.4094589948654175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14216757070144403, + "compression/movement_sparsity/importance_threshold": -0.00600803085445267, + "compression/movement_sparsity/linear_layer_sparsity": 0.030172913785787767, + "compression/movement_sparsity/model_sparsity": 0.029136381374506026, + "compression_loss": 15.336736679077148, + "distillation_loss": 0.49554261565208435, + "epoch": 2.15, + "learning_rate": 4.3608528223912845e-05, + "loss": 15.8452, + "step": 2544, + "task_loss": 0.5987598896026611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14292986250751516, + "compression/movement_sparsity/importance_threshold": -0.006002691964787801, + "compression/movement_sparsity/linear_layer_sparsity": 0.030653684300698127, + "compression/movement_sparsity/model_sparsity": 0.029600635943206118, + "compression_loss": 15.418764114379883, + "distillation_loss": 0.8125389814376831, + "epoch": 2.15, + "learning_rate": 4.360383206537053e-05, + "loss": 16.0127, + "step": 2545, + "task_loss": 0.8768194913864136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1436917025851222, + "compression/movement_sparsity/importance_threshold": -0.005997356238909304, + "compression/movement_sparsity/linear_layer_sparsity": 0.03125201518433046, + "compression/movement_sparsity/model_sparsity": 0.030178412320304587, + "compression_loss": 15.500741958618164, + "distillation_loss": 0.37669795751571655, + "epoch": 2.15, + "learning_rate": 4.359913590682821e-05, + "loss": 15.9534, + "step": 2546, + "task_loss": 0.6257323622703552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14445309106814996, + "compression/movement_sparsity/importance_threshold": -0.005992023675879487, + "compression/movement_sparsity/linear_layer_sparsity": 0.03176178527493124, + "compression/movement_sparsity/model_sparsity": 0.030670670240057, + "compression_loss": 15.582672119140625, + "distillation_loss": 0.4324830174446106, + "epoch": 2.15, + "learning_rate": 4.3594439748285904e-05, + "loss": 15.983, + "step": 2547, + "task_loss": 0.44929239153862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14521402809048345, + "compression/movement_sparsity/importance_threshold": -0.005986694274760654, + "compression/movement_sparsity/linear_layer_sparsity": 0.03231430350650659, + "compression/movement_sparsity/model_sparsity": 0.031204207770632843, + "compression_loss": 15.664546966552734, + "distillation_loss": 0.27873530983924866, + "epoch": 2.15, + "learning_rate": 4.358974358974359e-05, + "loss": 16.1197, + "step": 2548, + "task_loss": 0.6413927674293518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.145974513786008, + "compression/movement_sparsity/importance_threshold": -0.0059813680346151085, + "compression/movement_sparsity/linear_layer_sparsity": 0.032713250383099655, + "compression/movement_sparsity/model_sparsity": 0.03158944959471079, + "compression_loss": 15.746368408203125, + "distillation_loss": 0.44594377279281616, + "epoch": 2.15, + "learning_rate": 4.3585047431201284e-05, + "loss": 16.222, + "step": 2549, + "task_loss": 0.8348119258880615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14673454828860866, + "compression/movement_sparsity/importance_threshold": -0.0059760449545051535, + "compression/movement_sparsity/linear_layer_sparsity": 0.03328564620212399, + "compression/movement_sparsity/model_sparsity": 0.032142181856456134, + "compression_loss": 15.828140258789062, + "distillation_loss": 0.667431652545929, + "epoch": 2.16, + "learning_rate": 4.3580351272658963e-05, + "loss": 16.373, + "step": 2550, + "task_loss": 1.9103401899337769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14749413173217052, + "compression/movement_sparsity/importance_threshold": -0.005970725033493094, + "compression/movement_sparsity/linear_layer_sparsity": 0.03381852532960307, + "compression/movement_sparsity/model_sparsity": 0.03265675494657837, + "compression_loss": 15.909868240356445, + "distillation_loss": 0.4898093342781067, + "epoch": 2.16, + "learning_rate": 4.3575655114116657e-05, + "loss": 16.3311, + "step": 2551, + "task_loss": 0.4720446765422821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14825326425057883, + "compression/movement_sparsity/importance_threshold": -0.0059654082706412336, + "compression/movement_sparsity/linear_layer_sparsity": 0.03424742571529743, + "compression/movement_sparsity/model_sparsity": 0.03307092128457221, + "compression_loss": 15.991547584533691, + "distillation_loss": 0.31136834621429443, + "epoch": 2.16, + "learning_rate": 4.357095895557434e-05, + "loss": 16.537, + "step": 2552, + "task_loss": 0.21877436339855194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1490119459777186, + "compression/movement_sparsity/importance_threshold": -0.005960094665011878, + "compression/movement_sparsity/linear_layer_sparsity": 0.034558789580605084, + "compression/movement_sparsity/model_sparsity": 0.03337158884323926, + "compression_loss": 16.073169708251953, + "distillation_loss": 0.5418140888214111, + "epoch": 2.16, + "learning_rate": 4.356626279703203e-05, + "loss": 16.6177, + "step": 2553, + "task_loss": 0.9892523288726807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.14977017704747486, + "compression/movement_sparsity/importance_threshold": -0.005954784215667331, + "compression/movement_sparsity/linear_layer_sparsity": 0.03509928825120348, + "compression/movement_sparsity/model_sparsity": 0.033893519721734204, + "compression_loss": 16.154754638671875, + "distillation_loss": 0.42275217175483704, + "epoch": 2.16, + "learning_rate": 4.3561566638489716e-05, + "loss": 16.628, + "step": 2554, + "task_loss": 0.8480641841888428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15052795759373316, + "compression/movement_sparsity/importance_threshold": -0.005949476921669892, + "compression/movement_sparsity/linear_layer_sparsity": 0.03564878966836696, + "compression/movement_sparsity/model_sparsity": 0.034424144074754026, + "compression_loss": 16.236284255981445, + "distillation_loss": 0.2954394221305847, + "epoch": 2.16, + "learning_rate": 4.35568704799474e-05, + "loss": 16.7439, + "step": 2555, + "task_loss": 0.40342456102371216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15128528775037842, + "compression/movement_sparsity/importance_threshold": -0.005944172782081869, + "compression/movement_sparsity/linear_layer_sparsity": 0.0360580032532945, + "compression/movement_sparsity/model_sparsity": 0.03481929991415107, + "compression_loss": 16.317771911621094, + "distillation_loss": 0.6737840175628662, + "epoch": 2.16, + "learning_rate": 4.3552174321405095e-05, + "loss": 16.9437, + "step": 2556, + "task_loss": 1.033182978630066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15204216765129575, + "compression/movement_sparsity/importance_threshold": -0.005938871795965566, + "compression/movement_sparsity/linear_layer_sparsity": 0.036564911543662676, + "compression/movement_sparsity/model_sparsity": 0.0353087943453128, + "compression_loss": 16.399198532104492, + "distillation_loss": 0.5425746440887451, + "epoch": 2.16, + "learning_rate": 4.354747816286278e-05, + "loss": 17.0122, + "step": 2557, + "task_loss": 2.0958335399627686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1527985974303704, + "compression/movement_sparsity/importance_threshold": -0.005933573962383285, + "compression/movement_sparsity/linear_layer_sparsity": 0.03700468677024094, + "compression/movement_sparsity/model_sparsity": 0.03573346193995126, + "compression_loss": 16.48058319091797, + "distillation_loss": 0.5978001952171326, + "epoch": 2.16, + "learning_rate": 4.354278200432047e-05, + "loss": 16.9338, + "step": 2558, + "task_loss": 0.5980600118637085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15355457722148735, + "compression/movement_sparsity/importance_threshold": -0.005928279280397332, + "compression/movement_sparsity/linear_layer_sparsity": 0.037483942915861546, + "compression/movement_sparsity/model_sparsity": 0.03619625416260544, + "compression_loss": 16.561906814575195, + "distillation_loss": 0.6397451162338257, + "epoch": 2.16, + "learning_rate": 4.3538085845778154e-05, + "loss": 17.0903, + "step": 2559, + "task_loss": 1.148945689201355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15431010715853188, + "compression/movement_sparsity/importance_threshold": -0.00592298774907001, + "compression/movement_sparsity/linear_layer_sparsity": 0.037963199061482156, + "compression/movement_sparsity/model_sparsity": 0.036659046385259624, + "compression_loss": 16.643184661865234, + "distillation_loss": 0.5077143311500549, + "epoch": 2.16, + "learning_rate": 4.353338968723584e-05, + "loss": 17.2154, + "step": 2560, + "task_loss": 0.617636501789093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1550651873753891, + "compression/movement_sparsity/importance_threshold": -0.005917699367463623, + "compression/movement_sparsity/linear_layer_sparsity": 0.03848062446770516, + "compression/movement_sparsity/model_sparsity": 0.03715869663699214, + "compression_loss": 16.724411010742188, + "distillation_loss": 0.456114262342453, + "epoch": 2.16, + "learning_rate": 4.3528693528693534e-05, + "loss": 17.2304, + "step": 2561, + "task_loss": 0.4709998369216919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15581981800594402, + "compression/movement_sparsity/importance_threshold": -0.005912414134640475, + "compression/movement_sparsity/linear_layer_sparsity": 0.03902245864507877, + "compression/movement_sparsity/model_sparsity": 0.03768191714349607, + "compression_loss": 16.805585861206055, + "distillation_loss": 0.8539761304855347, + "epoch": 2.17, + "learning_rate": 4.352399737015122e-05, + "loss": 17.4272, + "step": 2562, + "task_loss": 1.3193838596343994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.156573999184082, + "compression/movement_sparsity/importance_threshold": -0.00590713204966287, + "compression/movement_sparsity/linear_layer_sparsity": 0.039566021826759584, + "compression/movement_sparsity/model_sparsity": 0.03820680725769021, + "compression_loss": 16.886709213256836, + "distillation_loss": 0.38395893573760986, + "epoch": 2.17, + "learning_rate": 4.3519301211608906e-05, + "loss": 17.2919, + "step": 2563, + "task_loss": 0.22457440197467804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1573277310436879, + "compression/movement_sparsity/importance_threshold": -0.0059018531115931135, + "compression/movement_sparsity/linear_layer_sparsity": 0.0401612881993095, + "compression/movement_sparsity/model_sparsity": 0.038781624399089484, + "compression_loss": 16.967763900756836, + "distillation_loss": 0.4609811305999756, + "epoch": 2.17, + "learning_rate": 4.351460505306659e-05, + "loss": 17.406, + "step": 2564, + "task_loss": 0.43393149971961975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1580810137186469, + "compression/movement_sparsity/importance_threshold": -0.005896577319493508, + "compression/movement_sparsity/linear_layer_sparsity": 0.04067735425042202, + "compression/movement_sparsity/model_sparsity": 0.03927996199374141, + "compression_loss": 17.04880142211914, + "distillation_loss": 0.30530282855033875, + "epoch": 2.17, + "learning_rate": 4.350990889452428e-05, + "loss": 17.5863, + "step": 2565, + "task_loss": 0.04473242163658142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15883384734284445, + "compression/movement_sparsity/importance_threshold": -0.005891304672426357, + "compression/movement_sparsity/linear_layer_sparsity": 0.04123901831857407, + "compression/movement_sparsity/model_sparsity": 0.03982233117327167, + "compression_loss": 17.129777908325195, + "distillation_loss": 0.28446143865585327, + "epoch": 2.17, + "learning_rate": 4.350521273598197e-05, + "loss": 17.538, + "step": 2566, + "task_loss": 0.4693899154663086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.15958623205016553, + "compression/movement_sparsity/importance_threshold": -0.005886035169453964, + "compression/movement_sparsity/linear_layer_sparsity": 0.04175643180062944, + "compression/movement_sparsity/model_sparsity": 0.04032196991046838, + "compression_loss": 17.2106990814209, + "distillation_loss": 0.34160158038139343, + "epoch": 2.17, + "learning_rate": 4.350051657743965e-05, + "loss": 17.6081, + "step": 2567, + "task_loss": 0.2820066511631012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1603381679744953, + "compression/movement_sparsity/importance_threshold": -0.0058807688096386345, + "compression/movement_sparsity/linear_layer_sparsity": 0.042417316867679515, + "compression/movement_sparsity/model_sparsity": 0.04096015154234501, + "compression_loss": 17.291553497314453, + "distillation_loss": 0.4809970259666443, + "epoch": 2.17, + "learning_rate": 4.3495820418897345e-05, + "loss": 17.7206, + "step": 2568, + "task_loss": 0.4056711792945862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16108965524971874, + "compression/movement_sparsity/importance_threshold": -0.005875505592042673, + "compression/movement_sparsity/linear_layer_sparsity": 0.04317760179614214, + "compression/movement_sparsity/model_sparsity": 0.04169431834460493, + "compression_loss": 17.37234115600586, + "distillation_loss": 0.6096224784851074, + "epoch": 2.17, + "learning_rate": 4.349112426035503e-05, + "loss": 17.9191, + "step": 2569, + "task_loss": 1.2472753524780273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16184069400972123, + "compression/movement_sparsity/importance_threshold": -0.005870245515728381, + "compression/movement_sparsity/linear_layer_sparsity": 0.0437118045062288, + "compression/movement_sparsity/model_sparsity": 0.04221016954820036, + "compression_loss": 17.453109741210938, + "distillation_loss": 0.271640419960022, + "epoch": 2.17, + "learning_rate": 4.348642810181272e-05, + "loss": 17.9301, + "step": 2570, + "task_loss": 1.587713599205017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16259128438838777, + "compression/movement_sparsity/importance_threshold": -0.005864988579758064, + "compression/movement_sparsity/linear_layer_sparsity": 0.04435132146487541, + "compression/movement_sparsity/model_sparsity": 0.042827717131933174, + "compression_loss": 17.5338134765625, + "distillation_loss": 0.4093632698059082, + "epoch": 2.17, + "learning_rate": 4.348173194327041e-05, + "loss": 18.0738, + "step": 2571, + "task_loss": 0.5096733570098877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16334142651960348, + "compression/movement_sparsity/importance_threshold": -0.005859734783194026, + "compression/movement_sparsity/linear_layer_sparsity": 0.04491910263102466, + "compression/movement_sparsity/model_sparsity": 0.04337599326832602, + "compression_loss": 17.61447525024414, + "distillation_loss": 0.6001753807067871, + "epoch": 2.17, + "learning_rate": 4.347703578472809e-05, + "loss": 18.082, + "step": 2572, + "task_loss": 0.907132089138031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1640911205372535, + "compression/movement_sparsity/importance_threshold": -0.005854484125098572, + "compression/movement_sparsity/linear_layer_sparsity": 0.045450455465046356, + "compression/movement_sparsity/model_sparsity": 0.04388909249786656, + "compression_loss": 17.695087432861328, + "distillation_loss": 0.4726676940917969, + "epoch": 2.17, + "learning_rate": 4.347233962618578e-05, + "loss": 18.1939, + "step": 2573, + "task_loss": 1.0323283672332764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16484036657522272, + "compression/movement_sparsity/importance_threshold": -0.005849236604534006, + "compression/movement_sparsity/linear_layer_sparsity": 0.04603825730865622, + "compression/movement_sparsity/model_sparsity": 0.044456701539858444, + "compression_loss": 17.775634765625, + "distillation_loss": 0.36013925075531006, + "epoch": 2.18, + "learning_rate": 4.346764346764347e-05, + "loss": 18.3455, + "step": 2574, + "task_loss": 0.7354448437690735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16558916476739693, + "compression/movement_sparsity/importance_threshold": -0.005843992220562629, + "compression/movement_sparsity/linear_layer_sparsity": 0.04644158035477165, + "compression/movement_sparsity/model_sparsity": 0.044846169198573, + "compression_loss": 17.856159210205078, + "distillation_loss": 0.6834437251091003, + "epoch": 2.18, + "learning_rate": 4.346294730910116e-05, + "loss": 18.4358, + "step": 2575, + "task_loss": 0.36337849497795105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16633751524766094, + "compression/movement_sparsity/importance_threshold": -0.005838750972246746, + "compression/movement_sparsity/linear_layer_sparsity": 0.04698495274977029, + "compression/movement_sparsity/model_sparsity": 0.045370875080194425, + "compression_loss": 17.936613082885742, + "distillation_loss": 0.6196240186691284, + "epoch": 2.18, + "learning_rate": 4.345825115055884e-05, + "loss": 18.5703, + "step": 2576, + "task_loss": 0.7004544138908386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16708541814989974, + "compression/movement_sparsity/importance_threshold": -0.005833512858648663, + "compression/movement_sparsity/linear_layer_sparsity": 0.04758634814448504, + "compression/movement_sparsity/model_sparsity": 0.04595161069299209, + "compression_loss": 18.01702880859375, + "distillation_loss": 0.37560123205184937, + "epoch": 2.18, + "learning_rate": 4.345355499201653e-05, + "loss": 18.5944, + "step": 2577, + "task_loss": 0.34965047240257263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1678328736079987, + "compression/movement_sparsity/importance_threshold": -0.005828277878830682, + "compression/movement_sparsity/linear_layer_sparsity": 0.04800149996489525, + "compression/movement_sparsity/model_sparsity": 0.04635250077121482, + "compression_loss": 18.097414016723633, + "distillation_loss": 0.6937552094459534, + "epoch": 2.18, + "learning_rate": 4.344885883347422e-05, + "loss": 18.7356, + "step": 2578, + "task_loss": 2.303344488143921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16857988175584282, + "compression/movement_sparsity/importance_threshold": -0.005823046031855108, + "compression/movement_sparsity/linear_layer_sparsity": 0.04865323919536862, + "compression/movement_sparsity/model_sparsity": 0.046981850754137036, + "compression_loss": 18.1777286529541, + "distillation_loss": 0.4613959789276123, + "epoch": 2.18, + "learning_rate": 4.344416267493191e-05, + "loss": 18.842, + "step": 2579, + "task_loss": 0.3624493479728699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.16932644272731723, + "compression/movement_sparsity/importance_threshold": -0.005817817316784245, + "compression/movement_sparsity/linear_layer_sparsity": 0.04910521284543837, + "compression/movement_sparsity/model_sparsity": 0.04741829771889331, + "compression_loss": 18.257991790771484, + "distillation_loss": 0.6476542949676514, + "epoch": 2.18, + "learning_rate": 4.3439466516389594e-05, + "loss": 18.9013, + "step": 2580, + "task_loss": 0.4365416169166565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17007255665630716, + "compression/movement_sparsity/importance_threshold": -0.005812591732680396, + "compression/movement_sparsity/linear_layer_sparsity": 0.04966382432667564, + "compression/movement_sparsity/model_sparsity": 0.04795771917726017, + "compression_loss": 18.338224411010742, + "distillation_loss": 0.45096755027770996, + "epoch": 2.18, + "learning_rate": 4.343477035784728e-05, + "loss": 18.7769, + "step": 2581, + "task_loss": 0.5146486759185791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17081822367669763, + "compression/movement_sparsity/importance_threshold": -0.005807369278605866, + "compression/movement_sparsity/linear_layer_sparsity": 0.05026387229044754, + "compression/movement_sparsity/model_sparsity": 0.04853715364751305, + "compression_loss": 18.418376922607422, + "distillation_loss": 0.44547998905181885, + "epoch": 2.18, + "learning_rate": 4.3430074199304974e-05, + "loss": 18.9035, + "step": 2582, + "task_loss": 0.38143670558929443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17156344392237388, + "compression/movement_sparsity/importance_threshold": -0.00580214995362296, + "compression/movement_sparsity/linear_layer_sparsity": 0.050747719240607954, + "compression/movement_sparsity/model_sparsity": 0.04900437896644813, + "compression_loss": 18.498502731323242, + "distillation_loss": 0.5979600548744202, + "epoch": 2.18, + "learning_rate": 4.342537804076266e-05, + "loss": 19.0264, + "step": 2583, + "task_loss": 0.7864080667495728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17230821752722114, + "compression/movement_sparsity/importance_threshold": -0.005796933756793979, + "compression/movement_sparsity/linear_layer_sparsity": 0.05135081979129463, + "compression/movement_sparsity/model_sparsity": 0.04958676115786441, + "compression_loss": 18.578575134277344, + "distillation_loss": 1.04672372341156, + "epoch": 2.18, + "learning_rate": 4.3420681882220346e-05, + "loss": 19.2967, + "step": 2584, + "task_loss": 0.7037136554718018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17305254462512398, + "compression/movement_sparsity/importance_threshold": -0.005791720687181231, + "compression/movement_sparsity/linear_layer_sparsity": 0.05212007169381943, + "compression/movement_sparsity/model_sparsity": 0.050329586891041826, + "compression_loss": 18.65860366821289, + "distillation_loss": 0.44890037178993225, + "epoch": 2.19, + "learning_rate": 4.341598572367803e-05, + "loss": 19.2456, + "step": 2585, + "task_loss": 0.363080769777298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17379642534996842, + "compression/movement_sparsity/importance_threshold": -0.005786510743847016, + "compression/movement_sparsity/linear_layer_sparsity": 0.05253063270968983, + "compression/movement_sparsity/model_sparsity": 0.05072604387298366, + "compression_loss": 18.738582611083984, + "distillation_loss": 0.5087771415710449, + "epoch": 2.19, + "learning_rate": 4.341128956513572e-05, + "loss": 19.3771, + "step": 2586, + "task_loss": 0.8138629198074341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17453985983563913, + "compression/movement_sparsity/importance_threshold": -0.005781303925853639, + "compression/movement_sparsity/linear_layer_sparsity": 0.05314422652789606, + "compression/movement_sparsity/model_sparsity": 0.05131855885589914, + "compression_loss": 18.818496704101562, + "distillation_loss": 0.5859801173210144, + "epoch": 2.19, + "learning_rate": 4.340659340659341e-05, + "loss": 19.3493, + "step": 2587, + "task_loss": 0.3020893335342407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17528284821602125, + "compression/movement_sparsity/importance_threshold": -0.005776100232263406, + "compression/movement_sparsity/linear_layer_sparsity": 0.05377917653033813, + "compression/movement_sparsity/model_sparsity": 0.05193169637242265, + "compression_loss": 18.898366928100586, + "distillation_loss": 0.43857818841934204, + "epoch": 2.19, + "learning_rate": 4.34018972480511e-05, + "loss": 19.4406, + "step": 2588, + "task_loss": 1.233670711517334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.176025390625, + "compression/movement_sparsity/importance_threshold": -0.005770899662138618, + "compression/movement_sparsity/linear_layer_sparsity": 0.05433188554859566, + "compression/movement_sparsity/model_sparsity": 0.0524654181355712, + "compression_loss": 18.97818946838379, + "distillation_loss": 0.6065633296966553, + "epoch": 2.19, + "learning_rate": 4.3397201089508785e-05, + "loss": 19.3978, + "step": 2589, + "task_loss": 0.9234592318534851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17676748719646052, + "compression/movement_sparsity/importance_threshold": -0.005765702214541581, + "compression/movement_sparsity/linear_layer_sparsity": 0.0549243020450806, + "compression/movement_sparsity/model_sparsity": 0.05303748330291558, + "compression_loss": 19.05794906616211, + "distillation_loss": 0.9864470958709717, + "epoch": 2.19, + "learning_rate": 4.339250493096647e-05, + "loss": 19.7077, + "step": 2590, + "task_loss": 1.4414234161376953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17750913806428792, + "compression/movement_sparsity/importance_threshold": -0.005760507888534598, + "compression/movement_sparsity/linear_layer_sparsity": 0.055408339781923194, + "compression/movement_sparsity/model_sparsity": 0.05350489285442337, + "compression_loss": 19.137659072875977, + "distillation_loss": 0.574617862701416, + "epoch": 2.19, + "learning_rate": 4.338780877242416e-05, + "loss": 19.6646, + "step": 2591, + "task_loss": 0.9419476389884949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17825034336236723, + "compression/movement_sparsity/importance_threshold": -0.005755316683179974, + "compression/movement_sparsity/linear_layer_sparsity": 0.05591201662286204, + "compression/movement_sparsity/model_sparsity": 0.05399126684638477, + "compression_loss": 19.217308044433594, + "distillation_loss": 0.49007704854011536, + "epoch": 2.19, + "learning_rate": 4.338311261388185e-05, + "loss": 19.6607, + "step": 2592, + "task_loss": 0.2581959664821625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17899110322458378, + "compression/movement_sparsity/importance_threshold": -0.0057501285975400124, + "compression/movement_sparsity/linear_layer_sparsity": 0.05661430239994383, + "compression/movement_sparsity/model_sparsity": 0.05466942694654005, + "compression_loss": 19.296913146972656, + "distillation_loss": 0.5571882724761963, + "epoch": 2.19, + "learning_rate": 4.337841645533953e-05, + "loss": 19.7352, + "step": 2593, + "task_loss": 0.2809939682483673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.17973141778482238, + "compression/movement_sparsity/importance_threshold": -0.00574494363067702, + "compression/movement_sparsity/linear_layer_sparsity": 0.057288936032278054, + "compression/movement_sparsity/model_sparsity": 0.055320884838187784, + "compression_loss": 19.376455307006836, + "distillation_loss": 0.4118906855583191, + "epoch": 2.19, + "learning_rate": 4.3373720296797223e-05, + "loss": 19.8222, + "step": 2594, + "task_loss": 0.42496564984321594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1804712871769687, + "compression/movement_sparsity/importance_threshold": -0.0057397617816532945, + "compression/movement_sparsity/linear_layer_sparsity": 0.05779912346874609, + "compression/movement_sparsity/model_sparsity": 0.05581354576669301, + "compression_loss": 19.455936431884766, + "distillation_loss": 0.7936160564422607, + "epoch": 2.19, + "learning_rate": 4.336902413825491e-05, + "loss": 20.0499, + "step": 2595, + "task_loss": 1.0827429294586182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18121071153490753, + "compression/movement_sparsity/importance_threshold": -0.0057345830495311445, + "compression/movement_sparsity/linear_layer_sparsity": 0.05847373325274504, + "compression/movement_sparsity/model_sparsity": 0.05646498062926915, + "compression_loss": 19.535396575927734, + "distillation_loss": 0.4885663688182831, + "epoch": 2.19, + "learning_rate": 4.3364327979712596e-05, + "loss": 20.1246, + "step": 2596, + "task_loss": 1.4198226928710938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18194969099252412, + "compression/movement_sparsity/importance_threshold": -0.005729407433372873, + "compression/movement_sparsity/linear_layer_sparsity": 0.059038700315332225, + "compression/movement_sparsity/model_sparsity": 0.057010539335214494, + "compression_loss": 19.61480140686035, + "distillation_loss": 0.604716956615448, + "epoch": 2.2, + "learning_rate": 4.335963182117028e-05, + "loss": 20.1506, + "step": 2597, + "task_loss": 0.42257821559906006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18268822568370358, + "compression/movement_sparsity/importance_threshold": -0.005724234932240784, + "compression/movement_sparsity/linear_layer_sparsity": 0.059786596033621194, + "compression/movement_sparsity/model_sparsity": 0.05773274253478389, + "compression_loss": 19.69416046142578, + "distillation_loss": 0.38764089345932007, + "epoch": 2.2, + "learning_rate": 4.335493566262797e-05, + "loss": 20.203, + "step": 2598, + "task_loss": 0.7139275074005127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18342631574233104, + "compression/movement_sparsity/importance_threshold": -0.0057190655451971805, + "compression/movement_sparsity/linear_layer_sparsity": 0.06040780939494675, + "compression/movement_sparsity/model_sparsity": 0.05833261530607208, + "compression_loss": 19.773475646972656, + "distillation_loss": 0.687164306640625, + "epoch": 2.2, + "learning_rate": 4.335023950408566e-05, + "loss": 20.3461, + "step": 2599, + "task_loss": 1.1853910684585571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18416396130229173, + "compression/movement_sparsity/importance_threshold": -0.005713899271304367, + "compression/movement_sparsity/linear_layer_sparsity": 0.060990853495669906, + "compression/movement_sparsity/model_sparsity": 0.058895630048281945, + "compression_loss": 19.85274314880371, + "distillation_loss": 0.46769702434539795, + "epoch": 2.2, + "learning_rate": 4.334554334554335e-05, + "loss": 20.2885, + "step": 2600, + "task_loss": 0.5585185289382935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18490116249747057, + "compression/movement_sparsity/importance_threshold": -0.005708736109624649, + "compression/movement_sparsity/linear_layer_sparsity": 0.06148233191311727, + "compression/movement_sparsity/model_sparsity": 0.05937022467012553, + "compression_loss": 19.931962966918945, + "distillation_loss": 0.3417395055294037, + "epoch": 2.2, + "learning_rate": 4.334084718700104e-05, + "loss": 20.5222, + "step": 2601, + "task_loss": 1.5005073547363281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1856379194617529, + "compression/movement_sparsity/importance_threshold": -0.005703576059220328, + "compression/movement_sparsity/linear_layer_sparsity": 0.06209763088729544, + "compression/movement_sparsity/model_sparsity": 0.05996438623165963, + "compression_loss": 20.011117935180664, + "distillation_loss": 1.0206444263458252, + "epoch": 2.2, + "learning_rate": 4.333615102845872e-05, + "loss": 20.757, + "step": 2602, + "task_loss": 0.6419742107391357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18637423232902384, + "compression/movement_sparsity/importance_threshold": -0.00569841911915371, + "compression/movement_sparsity/linear_layer_sparsity": 0.06260607739528863, + "compression/movement_sparsity/model_sparsity": 0.06045536603793885, + "compression_loss": 20.090251922607422, + "distillation_loss": 0.5924590229988098, + "epoch": 2.2, + "learning_rate": 4.333145486991641e-05, + "loss": 20.6306, + "step": 2603, + "task_loss": 0.24952159821987152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1871101012331684, + "compression/movement_sparsity/importance_threshold": -0.005693265288487098, + "compression/movement_sparsity/linear_layer_sparsity": 0.06316471272486118, + "compression/movement_sparsity/model_sparsity": 0.060994810525377294, + "compression_loss": 20.1693115234375, + "distillation_loss": 0.450995534658432, + "epoch": 2.2, + "learning_rate": 4.33267587113741e-05, + "loss": 20.6548, + "step": 2604, + "task_loss": 0.3976738154888153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1878455263080716, + "compression/movement_sparsity/importance_threshold": -0.005688114566282799, + "compression/movement_sparsity/linear_layer_sparsity": 0.06379965080313561, + "compression/movement_sparsity/model_sparsity": 0.061607936527365, + "compression_loss": 20.248327255249023, + "distillation_loss": 0.6952104568481445, + "epoch": 2.2, + "learning_rate": 4.332206255283179e-05, + "loss": 20.7686, + "step": 2605, + "task_loss": 0.7780268788337708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18858050768761903, + "compression/movement_sparsity/importance_threshold": -0.005682966951603112, + "compression/movement_sparsity/linear_layer_sparsity": 0.06444392550466893, + "compression/movement_sparsity/model_sparsity": 0.06223007841087984, + "compression_loss": 20.327301025390625, + "distillation_loss": 0.5592168569564819, + "epoch": 2.2, + "learning_rate": 4.331736639428947e-05, + "loss": 20.7604, + "step": 2606, + "task_loss": 0.8684561848640442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.18931504550569556, + "compression/movement_sparsity/importance_threshold": -0.005677822443510342, + "compression/movement_sparsity/linear_layer_sparsity": 0.06513705352100638, + "compression/movement_sparsity/model_sparsity": 0.0628993953475449, + "compression_loss": 20.40620231628418, + "distillation_loss": 0.3979378342628479, + "epoch": 2.2, + "learning_rate": 4.331267023574716e-05, + "loss": 20.8494, + "step": 2607, + "task_loss": 0.146898552775383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19004913989618644, + "compression/movement_sparsity/importance_threshold": -0.005672681041066795, + "compression/movement_sparsity/linear_layer_sparsity": 0.06587883214129814, + "compression/movement_sparsity/model_sparsity": 0.0636156915902517, + "compression_loss": 20.48504638671875, + "distillation_loss": 0.8707166910171509, + "epoch": 2.2, + "learning_rate": 4.330797407720485e-05, + "loss": 21.0762, + "step": 2608, + "task_loss": 0.693575918674469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19078279099297668, + "compression/movement_sparsity/importance_threshold": -0.005667542743334773, + "compression/movement_sparsity/linear_layer_sparsity": 0.06658400356694781, + "compression/movement_sparsity/model_sparsity": 0.06429663820806925, + "compression_loss": 20.563831329345703, + "distillation_loss": 0.519855260848999, + "epoch": 2.21, + "learning_rate": 4.330327791866254e-05, + "loss": 21.0682, + "step": 2609, + "task_loss": 0.7062942385673523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19151599892995141, + "compression/movement_sparsity/importance_threshold": -0.005662407549376584, + "compression/movement_sparsity/linear_layer_sparsity": 0.06720691016007765, + "compression/movement_sparsity/model_sparsity": 0.06489814604344027, + "compression_loss": 20.6425838470459, + "distillation_loss": 0.28868207335472107, + "epoch": 2.21, + "learning_rate": 4.329858176012022e-05, + "loss": 20.985, + "step": 2610, + "task_loss": 0.3222710192203522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19224876384099587, + "compression/movement_sparsity/importance_threshold": -0.005657275458254526, + "compression/movement_sparsity/linear_layer_sparsity": 0.06787564132943212, + "compression/movement_sparsity/model_sparsity": 0.06554390423986971, + "compression_loss": 20.721256256103516, + "distillation_loss": 0.4006032943725586, + "epoch": 2.21, + "learning_rate": 4.329388560157791e-05, + "loss": 21.3493, + "step": 2611, + "task_loss": 1.075363039970398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19298108585999518, + "compression/movement_sparsity/importance_threshold": -0.005652146469030908, + "compression/movement_sparsity/linear_layer_sparsity": 0.06846785511506727, + "compression/movement_sparsity/model_sparsity": 0.06611577366010557, + "compression_loss": 20.799875259399414, + "distillation_loss": 0.7120269536972046, + "epoch": 2.21, + "learning_rate": 4.32891894430356e-05, + "loss": 21.5071, + "step": 2612, + "task_loss": 0.5701888799667358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19371296512083447, + "compression/movement_sparsity/importance_threshold": -0.005647020580768032, + "compression/movement_sparsity/linear_layer_sparsity": 0.06924018345284211, + "compression/movement_sparsity/model_sparsity": 0.06686157014351798, + "compression_loss": 20.878467559814453, + "distillation_loss": 0.6383981704711914, + "epoch": 2.21, + "learning_rate": 4.328449328449329e-05, + "loss": 21.585, + "step": 2613, + "task_loss": 0.15748874843120575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19444440175739863, + "compression/movement_sparsity/importance_threshold": -0.005641897792528203, + "compression/movement_sparsity/linear_layer_sparsity": 0.06987075728576182, + "compression/movement_sparsity/model_sparsity": 0.06747048182540488, + "compression_loss": 20.957014083862305, + "distillation_loss": 0.5648521780967712, + "epoch": 2.21, + "learning_rate": 4.327979712595097e-05, + "loss": 21.6449, + "step": 2614, + "task_loss": 0.3372357487678528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19517539590357325, + "compression/movement_sparsity/importance_threshold": -0.005636778103373722, + "compression/movement_sparsity/linear_layer_sparsity": 0.07051791763586301, + "compression/movement_sparsity/model_sparsity": 0.068095410226582, + "compression_loss": 21.03551483154297, + "distillation_loss": 0.44243037700653076, + "epoch": 2.21, + "learning_rate": 4.3275100967408664e-05, + "loss": 21.6, + "step": 2615, + "task_loss": 0.5873730778694153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19590594769324332, + "compression/movement_sparsity/importance_threshold": -0.005631661512366895, + "compression/movement_sparsity/linear_layer_sparsity": 0.07102159447680187, + "compression/movement_sparsity/model_sparsity": 0.0685817842185434, + "compression_loss": 21.113969802856445, + "distillation_loss": 0.4282251298427582, + "epoch": 2.21, + "learning_rate": 4.327040480886635e-05, + "loss": 21.5722, + "step": 2616, + "task_loss": 0.8199256062507629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19663605726029387, + "compression/movement_sparsity/importance_threshold": -0.005626548018570027, + "compression/movement_sparsity/linear_layer_sparsity": 0.07159568352763049, + "compression/movement_sparsity/model_sparsity": 0.06913615154437157, + "compression_loss": 21.1923770904541, + "distillation_loss": 0.6075276136398315, + "epoch": 2.21, + "learning_rate": 4.3265708650324036e-05, + "loss": 21.7401, + "step": 2617, + "task_loss": 1.2810386419296265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19736572473861014, + "compression/movement_sparsity/importance_threshold": -0.00562143762104542, + "compression/movement_sparsity/linear_layer_sparsity": 0.07239851388621787, + "compression/movement_sparsity/model_sparsity": 0.06991140221034642, + "compression_loss": 21.27072525024414, + "distillation_loss": 0.6929813623428345, + "epoch": 2.21, + "learning_rate": 4.326101249178173e-05, + "loss": 21.9489, + "step": 2618, + "task_loss": 0.7044781446456909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19809495026207724, + "compression/movement_sparsity/importance_threshold": -0.005616330318855379, + "compression/movement_sparsity/linear_layer_sparsity": 0.07310977856152946, + "compression/movement_sparsity/model_sparsity": 0.07059823275595498, + "compression_loss": 21.349048614501953, + "distillation_loss": 0.7015190124511719, + "epoch": 2.21, + "learning_rate": 4.325631633323941e-05, + "loss": 21.966, + "step": 2619, + "task_loss": 0.6157370209693909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.1988237339645803, + "compression/movement_sparsity/importance_threshold": -0.005611226111062208, + "compression/movement_sparsity/linear_layer_sparsity": 0.07374778115088632, + "compression/movement_sparsity/model_sparsity": 0.07121431799364188, + "compression_loss": 21.427309036254883, + "distillation_loss": 0.7665265798568726, + "epoch": 2.21, + "learning_rate": 4.32516201746971e-05, + "loss": 22.049, + "step": 2620, + "task_loss": 0.8627924919128418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.19955207598000446, + "compression/movement_sparsity/importance_threshold": -0.00560612499672821, + "compression/movement_sparsity/linear_layer_sparsity": 0.07445904582619792, + "compression/movement_sparsity/model_sparsity": 0.07190114853925045, + "compression_loss": 21.505512237548828, + "distillation_loss": 0.7955713272094727, + "epoch": 2.22, + "learning_rate": 4.324692401615479e-05, + "loss": 22.1575, + "step": 2621, + "task_loss": 0.3291466236114502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20027997644223483, + "compression/movement_sparsity/importance_threshold": -0.005601026974915691, + "compression/movement_sparsity/linear_layer_sparsity": 0.07520103908150712, + "compression/movement_sparsity/model_sparsity": 0.07261765204360154, + "compression_loss": 21.58367347717285, + "distillation_loss": 0.3682970404624939, + "epoch": 2.22, + "learning_rate": 4.3242227857612475e-05, + "loss": 22.0635, + "step": 2622, + "task_loss": 0.9004453420639038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20100743548515654, + "compression/movement_sparsity/importance_threshold": -0.005595932044686953, + "compression/movement_sparsity/linear_layer_sparsity": 0.075643676108318, + "compression/movement_sparsity/model_sparsity": 0.0730450831268307, + "compression_loss": 21.66181182861328, + "distillation_loss": 0.625929057598114, + "epoch": 2.22, + "learning_rate": 4.323753169907016e-05, + "loss": 22.4386, + "step": 2623, + "task_loss": 1.2974079847335815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20173445324265482, + "compression/movement_sparsity/importance_threshold": -0.005590840205104302, + "compression/movement_sparsity/linear_layer_sparsity": 0.07625420541544181, + "compression/movement_sparsity/model_sparsity": 0.07363463887404698, + "compression_loss": 21.739891052246094, + "distillation_loss": 0.29105985164642334, + "epoch": 2.22, + "learning_rate": 4.323283554052785e-05, + "loss": 22.4226, + "step": 2624, + "task_loss": 0.2771998941898346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20246102984861447, + "compression/movement_sparsity/importance_threshold": -0.005585751455230042, + "compression/movement_sparsity/linear_layer_sparsity": 0.07685862954873608, + "compression/movement_sparsity/model_sparsity": 0.07421829917893645, + "compression_loss": 21.81793212890625, + "distillation_loss": 0.39496883749961853, + "epoch": 2.22, + "learning_rate": 4.322813938198554e-05, + "loss": 22.3592, + "step": 2625, + "task_loss": 0.7103185653686523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20318716543692117, + "compression/movement_sparsity/importance_threshold": -0.005580665794126475, + "compression/movement_sparsity/linear_layer_sparsity": 0.07767674669023264, + "compression/movement_sparsity/model_sparsity": 0.07500831147979989, + "compression_loss": 21.895919799804688, + "distillation_loss": 0.517465353012085, + "epoch": 2.22, + "learning_rate": 4.322344322344323e-05, + "loss": 22.4793, + "step": 2626, + "task_loss": 0.5246644616127014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20391286014145993, + "compression/movement_sparsity/importance_threshold": -0.005575583220855903, + "compression/movement_sparsity/linear_layer_sparsity": 0.07824758044329667, + "compression/movement_sparsity/model_sparsity": 0.07555953533735615, + "compression_loss": 21.973861694335938, + "distillation_loss": 0.35133275389671326, + "epoch": 2.22, + "learning_rate": 4.321874706490091e-05, + "loss": 22.4376, + "step": 2627, + "task_loss": 0.5254411697387695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20463811409611565, + "compression/movement_sparsity/importance_threshold": -0.005570503734480635, + "compression/movement_sparsity/linear_layer_sparsity": 0.07897277254640696, + "compression/movement_sparsity/model_sparsity": 0.07625981486077274, + "compression_loss": 22.051734924316406, + "distillation_loss": 0.5293159484863281, + "epoch": 2.22, + "learning_rate": 4.32140509063586e-05, + "loss": 22.5723, + "step": 2628, + "task_loss": 1.3881436586380005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20536292743477358, + "compression/movement_sparsity/importance_threshold": -0.005565427334062973, + "compression/movement_sparsity/linear_layer_sparsity": 0.07953750112564142, + "compression/movement_sparsity/model_sparsity": 0.07680514327600219, + "compression_loss": 22.129554748535156, + "distillation_loss": 0.42485934495925903, + "epoch": 2.22, + "learning_rate": 4.3209354747816286e-05, + "loss": 22.5856, + "step": 2629, + "task_loss": 0.7008118033409119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20608730029131894, + "compression/movement_sparsity/importance_threshold": -0.005560354018665219, + "compression/movement_sparsity/linear_layer_sparsity": 0.08024131319618061, + "compression/movement_sparsity/model_sparsity": 0.07748477723673916, + "compression_loss": 22.207319259643555, + "distillation_loss": 0.5170126557350159, + "epoch": 2.22, + "learning_rate": 4.320465858927398e-05, + "loss": 22.6649, + "step": 2630, + "task_loss": 0.18735742568969727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20681123279963676, + "compression/movement_sparsity/importance_threshold": -0.00555528378734968, + "compression/movement_sparsity/linear_layer_sparsity": 0.08094801091528767, + "compression/movement_sparsity/model_sparsity": 0.07816719771513843, + "compression_loss": 22.285032272338867, + "distillation_loss": 0.6301615238189697, + "epoch": 2.22, + "learning_rate": 4.319996243073166e-05, + "loss": 22.7945, + "step": 2631, + "task_loss": 0.7999787926673889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20753472509361226, + "compression/movement_sparsity/importance_threshold": -0.005550216639178658, + "compression/movement_sparsity/linear_layer_sparsity": 0.08151272757035449, + "compression/movement_sparsity/model_sparsity": 0.07871251461583208, + "compression_loss": 22.362688064575195, + "distillation_loss": 0.7996900081634521, + "epoch": 2.22, + "learning_rate": 4.319526627218935e-05, + "loss": 23.0076, + "step": 2632, + "task_loss": 1.283111572265625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.20825777730713058, + "compression/movement_sparsity/importance_threshold": -0.005545152573214457, + "compression/movement_sparsity/linear_layer_sparsity": 0.08229135186864107, + "compression/movement_sparsity/model_sparsity": 0.07946439077414401, + "compression_loss": 22.440296173095703, + "distillation_loss": 0.43262556195259094, + "epoch": 2.23, + "learning_rate": 4.319057011364704e-05, + "loss": 22.9309, + "step": 2633, + "task_loss": 0.22137288749217987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2089803895740765, + "compression/movement_sparsity/importance_threshold": -0.005540091588519385, + "compression/movement_sparsity/linear_layer_sparsity": 0.08296444728335027, + "compression/movement_sparsity/model_sparsity": 0.08011436329067424, + "compression_loss": 22.517831802368164, + "distillation_loss": 0.7679336071014404, + "epoch": 2.23, + "learning_rate": 4.3185873955104724e-05, + "loss": 23.2776, + "step": 2634, + "task_loss": 0.5178093910217285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2097025620283357, + "compression/movement_sparsity/importance_threshold": -0.00553503368415574, + "compression/movement_sparsity/linear_layer_sparsity": 0.08357496466630646, + "compression/movement_sparsity/model_sparsity": 0.08070390752335473, + "compression_loss": 22.595335006713867, + "distillation_loss": 0.36907321214675903, + "epoch": 2.23, + "learning_rate": 4.318117779656242e-05, + "loss": 23.3036, + "step": 2635, + "task_loss": 1.0599133968353271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2104242948037932, + "compression/movement_sparsity/importance_threshold": -0.005529978859185828, + "compression/movement_sparsity/linear_layer_sparsity": 0.08416126406479421, + "compression/movement_sparsity/model_sparsity": 0.08127006573383651, + "compression_loss": 22.67280387878418, + "distillation_loss": 0.36618030071258545, + "epoch": 2.23, + "learning_rate": 4.31764816380201e-05, + "loss": 23.2935, + "step": 2636, + "task_loss": 1.2144086360931396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21114558803433403, + "compression/movement_sparsity/importance_threshold": -0.005524927112671954, + "compression/movement_sparsity/linear_layer_sparsity": 0.08492784495376857, + "compression/movement_sparsity/model_sparsity": 0.08201031221099594, + "compression_loss": 22.750207901000977, + "distillation_loss": 0.9153242707252502, + "epoch": 2.23, + "learning_rate": 4.317178547947779e-05, + "loss": 23.4697, + "step": 2637, + "task_loss": 0.7148751616477966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2118664418538433, + "compression/movement_sparsity/importance_threshold": -0.005519878443676422, + "compression/movement_sparsity/linear_layer_sparsity": 0.08551243919628439, + "compression/movement_sparsity/model_sparsity": 0.08257482384285911, + "compression_loss": 22.82757568359375, + "distillation_loss": 0.33101460337638855, + "epoch": 2.23, + "learning_rate": 4.3167089320935477e-05, + "loss": 23.3392, + "step": 2638, + "task_loss": 0.04958055540919304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21258685639620623, + "compression/movement_sparsity/importance_threshold": -0.005514832851261534, + "compression/movement_sparsity/linear_layer_sparsity": 0.08619643330021276, + "compression/movement_sparsity/model_sparsity": 0.08323532064510555, + "compression_loss": 22.9049015045166, + "distillation_loss": 0.6901434659957886, + "epoch": 2.23, + "learning_rate": 4.316239316239317e-05, + "loss": 23.6203, + "step": 2639, + "task_loss": 0.4386698305606842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21330683179530796, + "compression/movement_sparsity/importance_threshold": -0.005509790334489596, + "compression/movement_sparsity/linear_layer_sparsity": 0.0868466343130611, + "compression/movement_sparsity/model_sparsity": 0.08386318525291027, + "compression_loss": 22.98215675354004, + "distillation_loss": 0.5602027177810669, + "epoch": 2.23, + "learning_rate": 4.315769700385085e-05, + "loss": 23.5612, + "step": 2640, + "task_loss": 0.577620267868042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21402636818503362, + "compression/movement_sparsity/importance_threshold": -0.005504750892422913, + "compression/movement_sparsity/linear_layer_sparsity": 0.0874571516960173, + "compression/movement_sparsity/model_sparsity": 0.08445272948559077, + "compression_loss": 23.059404373168945, + "distillation_loss": 0.7182589769363403, + "epoch": 2.23, + "learning_rate": 4.3153000845308536e-05, + "loss": 23.7557, + "step": 2641, + "task_loss": 1.621577501296997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21474546569926822, + "compression/movement_sparsity/importance_threshold": -0.005499714524123786, + "compression/movement_sparsity/linear_layer_sparsity": 0.08797609147153004, + "compression/movement_sparsity/model_sparsity": 0.08495384208336917, + "compression_loss": 23.13658332824707, + "distillation_loss": 0.5763847231864929, + "epoch": 2.23, + "learning_rate": 4.314830468676623e-05, + "loss": 23.7796, + "step": 2642, + "task_loss": 0.7239302396774292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2154641244718971, + "compression/movement_sparsity/importance_threshold": -0.00549468122865452, + "compression/movement_sparsity/linear_layer_sparsity": 0.08858682348950368, + "compression/movement_sparsity/model_sparsity": 0.08554359357769396, + "compression_loss": 23.213701248168945, + "distillation_loss": 0.2924191653728485, + "epoch": 2.23, + "learning_rate": 4.3143608528223915e-05, + "loss": 23.7901, + "step": 2643, + "task_loss": 0.8006916642189026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2161823446368053, + "compression/movement_sparsity/importance_threshold": -0.005489651005077421, + "compression/movement_sparsity/linear_layer_sparsity": 0.0893377598705398, + "compression/movement_sparsity/model_sparsity": 0.08626873298389097, + "compression_loss": 23.290775299072266, + "distillation_loss": 0.5186960101127625, + "epoch": 2.23, + "learning_rate": 4.31389123696816e-05, + "loss": 23.9969, + "step": 2644, + "task_loss": 0.6446994543075562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2169001263278777, + "compression/movement_sparsity/importance_threshold": -0.005484623852454792, + "compression/movement_sparsity/linear_layer_sparsity": 0.09000170944867228, + "compression/movement_sparsity/model_sparsity": 0.0869098738514668, + "compression_loss": 23.36781120300293, + "distillation_loss": 0.5910739898681641, + "epoch": 2.24, + "learning_rate": 4.313421621113929e-05, + "loss": 23.8928, + "step": 2645, + "task_loss": 0.5940378904342651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21761746967899998, + "compression/movement_sparsity/importance_threshold": -0.0054795997698489345, + "compression/movement_sparsity/linear_layer_sparsity": 0.09055423960441526, + "compression/movement_sparsity/model_sparsity": 0.08744342289657843, + "compression_loss": 23.44477081298828, + "distillation_loss": 0.3057408630847931, + "epoch": 2.24, + "learning_rate": 4.312952005259698e-05, + "loss": 23.9449, + "step": 2646, + "task_loss": 0.33401161432266235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21833437482405704, + "compression/movement_sparsity/importance_threshold": -0.005474578756322155, + "compression/movement_sparsity/linear_layer_sparsity": 0.09131432182202807, + "compression/movement_sparsity/model_sparsity": 0.08817739395172984, + "compression_loss": 23.52169418334961, + "distillation_loss": 0.32448580861091614, + "epoch": 2.24, + "learning_rate": 4.312482389405467e-05, + "loss": 24.0271, + "step": 2647, + "task_loss": 0.8096652030944824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2190508418969339, + "compression/movement_sparsity/importance_threshold": -0.005469560810936756, + "compression/movement_sparsity/linear_layer_sparsity": 0.09198590286744752, + "compression/movement_sparsity/model_sparsity": 0.08882590412221418, + "compression_loss": 23.598539352416992, + "distillation_loss": 0.35537588596343994, + "epoch": 2.24, + "learning_rate": 4.3120127735512354e-05, + "loss": 23.9784, + "step": 2648, + "task_loss": 0.3334835469722748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.21976687103151593, + "compression/movement_sparsity/importance_threshold": -0.005464545932755043, + "compression/movement_sparsity/linear_layer_sparsity": 0.09271547114008016, + "compression/movement_sparsity/model_sparsity": 0.08953040948026736, + "compression_loss": 23.675331115722656, + "distillation_loss": 0.3941245675086975, + "epoch": 2.24, + "learning_rate": 4.311543157697004e-05, + "loss": 24.3316, + "step": 2649, + "task_loss": 0.7037419080734253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.220482462361688, + "compression/movement_sparsity/importance_threshold": -0.005459534120839319, + "compression/movement_sparsity/linear_layer_sparsity": 0.09344505133688044, + "compression/movement_sparsity/model_sparsity": 0.09023492635285633, + "compression_loss": 23.75208282470703, + "distillation_loss": 0.8570448756217957, + "epoch": 2.24, + "learning_rate": 4.3110735418427726e-05, + "loss": 24.4649, + "step": 2650, + "task_loss": 0.5637337565422058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.22119761602133547, + "compression/movement_sparsity/importance_threshold": -0.005454525374251888, + "compression/movement_sparsity/linear_layer_sparsity": 0.09423874778305875, + "compression/movement_sparsity/model_sparsity": 0.09100135688441256, + "compression_loss": 23.82878303527832, + "distillation_loss": 0.3655344545841217, + "epoch": 2.24, + "learning_rate": 4.310603925988542e-05, + "loss": 24.4007, + "step": 2651, + "task_loss": 0.6456930637359619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.22191233214434336, + "compression/movement_sparsity/importance_threshold": -0.005449519692055055, + "compression/movement_sparsity/linear_layer_sparsity": 0.09487372163383609, + "compression/movement_sparsity/model_sparsity": 0.09161451743000766, + "compression_loss": 23.905481338500977, + "distillation_loss": 0.5830286741256714, + "epoch": 2.24, + "learning_rate": 4.3101343101343106e-05, + "loss": 24.5498, + "step": 2652, + "task_loss": 0.5962634682655334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2226266108645969, + "compression/movement_sparsity/importance_threshold": -0.005444517073311122, + "compression/movement_sparsity/linear_layer_sparsity": 0.09561263845389525, + "compression/movement_sparsity/model_sparsity": 0.09232805018412377, + "compression_loss": 23.98211669921875, + "distillation_loss": 0.5124383568763733, + "epoch": 2.24, + "learning_rate": 4.309664694280079e-05, + "loss": 24.5545, + "step": 2653, + "task_loss": 0.18641330301761627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2233404523159811, + "compression/movement_sparsity/importance_threshold": -0.005439517517082395, + "compression/movement_sparsity/linear_layer_sparsity": 0.09640648991425284, + "compression/movement_sparsity/model_sparsity": 0.09309463040464531, + "compression_loss": 24.058698654174805, + "distillation_loss": 0.3970463275909424, + "epoch": 2.24, + "learning_rate": 4.309195078425848e-05, + "loss": 24.4963, + "step": 2654, + "task_loss": 1.1362547874450684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2240538566323813, + "compression/movement_sparsity/importance_threshold": -0.005434521022431176, + "compression/movement_sparsity/linear_layer_sparsity": 0.09717575374094527, + "compression/movement_sparsity/model_sparsity": 0.09383746765235852, + "compression_loss": 24.13520050048828, + "distillation_loss": 0.27783599495887756, + "epoch": 2.24, + "learning_rate": 4.3087254625716165e-05, + "loss": 24.6677, + "step": 2655, + "task_loss": 1.9453346729278564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2247668239476821, + "compression/movement_sparsity/importance_threshold": -0.005429527588419774, + "compression/movement_sparsity/linear_layer_sparsity": 0.09798030117967221, + "compression/movement_sparsity/model_sparsity": 0.09461437641148777, + "compression_loss": 24.211641311645508, + "distillation_loss": 0.5363422632217407, + "epoch": 2.24, + "learning_rate": 4.308255846717386e-05, + "loss": 24.9002, + "step": 2656, + "task_loss": 1.1082571744918823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.22547935439576938, + "compression/movement_sparsity/importance_threshold": -0.005424537214110486, + "compression/movement_sparsity/linear_layer_sparsity": 0.0987314283473905, + "compression/movement_sparsity/model_sparsity": 0.09533970005025749, + "compression_loss": 24.288034439086914, + "distillation_loss": 0.6401575207710266, + "epoch": 2.25, + "learning_rate": 4.307786230863154e-05, + "loss": 24.8531, + "step": 2657, + "task_loss": 1.1238659620285034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.22619144811052805, + "compression/movement_sparsity/importance_threshold": -0.005419549898565619, + "compression/movement_sparsity/linear_layer_sparsity": 0.0994396404357873, + "compression/movement_sparsity/model_sparsity": 0.09602358287470265, + "compression_loss": 24.364395141601562, + "distillation_loss": 0.5819977521896362, + "epoch": 2.25, + "learning_rate": 4.307316615008923e-05, + "loss": 25.0142, + "step": 2658, + "task_loss": 0.9651921391487122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.22690310522584312, + "compression/movement_sparsity/importance_threshold": -0.005414565640847477, + "compression/movement_sparsity/linear_layer_sparsity": 0.10020909504916191, + "compression/movement_sparsity/model_sparsity": 0.09676660435498857, + "compression_loss": 24.44072151184082, + "distillation_loss": 0.40604087710380554, + "epoch": 2.25, + "learning_rate": 4.306846999154692e-05, + "loss": 25.0095, + "step": 2659, + "task_loss": 1.1372036933898926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2276143258755997, + "compression/movement_sparsity/importance_threshold": -0.005409584440018365, + "compression/movement_sparsity/linear_layer_sparsity": 0.10101194925608457, + "compression/movement_sparsity/model_sparsity": 0.09754187805003502, + "compression_loss": 24.51698112487793, + "distillation_loss": 0.6189146041870117, + "epoch": 2.25, + "learning_rate": 4.30637738330046e-05, + "loss": 25.2917, + "step": 2660, + "task_loss": 0.4809848964214325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.22832511019368307, + "compression/movement_sparsity/importance_threshold": -0.005404606295140585, + "compression/movement_sparsity/linear_layer_sparsity": 0.10173543620322292, + "compression/movement_sparsity/model_sparsity": 0.09824051099483298, + "compression_loss": 24.593198776245117, + "distillation_loss": 0.6007993817329407, + "epoch": 2.25, + "learning_rate": 4.305907767446229e-05, + "loss": 25.0992, + "step": 2661, + "task_loss": 0.20378516614437103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2290354583139782, + "compression/movement_sparsity/importance_threshold": -0.005399631205276443, + "compression/movement_sparsity/linear_layer_sparsity": 0.10242535661846632, + "compression/movement_sparsity/model_sparsity": 0.09890673052136932, + "compression_loss": 24.669363021850586, + "distillation_loss": 0.668059766292572, + "epoch": 2.25, + "learning_rate": 4.3054381515919976e-05, + "loss": 25.1931, + "step": 2662, + "task_loss": 0.9398033022880554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.22974537037037046, + "compression/movement_sparsity/importance_threshold": -0.005394659169488241, + "compression/movement_sparsity/linear_layer_sparsity": 0.10314883164143704, + "compression/movement_sparsity/model_sparsity": 0.09960535195163149, + "compression_loss": 24.7454833984375, + "distillation_loss": 0.6150960326194763, + "epoch": 2.25, + "learning_rate": 4.304968535737767e-05, + "loss": 25.4581, + "step": 2663, + "task_loss": 0.9186568260192871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23045484649674486, + "compression/movement_sparsity/importance_threshold": -0.0053896901868382835, + "compression/movement_sparsity/linear_layer_sparsity": 0.10380839312587953, + "compression/movement_sparsity/model_sparsity": 0.10024225547003493, + "compression_loss": 24.821561813354492, + "distillation_loss": 0.5443013310432434, + "epoch": 2.25, + "learning_rate": 4.3044989198835355e-05, + "loss": 25.3143, + "step": 2664, + "task_loss": 0.8609303832054138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2311638868269862, + "compression/movement_sparsity/importance_threshold": -0.0053847242563888785, + "compression/movement_sparsity/linear_layer_sparsity": 0.10450153306638461, + "compression/movement_sparsity/model_sparsity": 0.10091158392123578, + "compression_loss": 24.8975772857666, + "distillation_loss": 0.3234819769859314, + "epoch": 2.25, + "learning_rate": 4.304029304029304e-05, + "loss": 25.3424, + "step": 2665, + "task_loss": 0.2265138328075409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23187249149498024, + "compression/movement_sparsity/importance_threshold": -0.0053797613772023235, + "compression/movement_sparsity/linear_layer_sparsity": 0.1053259461683929, + "compression/movement_sparsity/model_sparsity": 0.10170767589699875, + "compression_loss": 24.973535537719727, + "distillation_loss": 0.7286474704742432, + "epoch": 2.25, + "learning_rate": 4.303559688175073e-05, + "loss": 25.4991, + "step": 2666, + "task_loss": 1.1004260778427124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2325806606346119, + "compression/movement_sparsity/importance_threshold": -0.005374801548340926, + "compression/movement_sparsity/linear_layer_sparsity": 0.10605860280044324, + "compression/movement_sparsity/model_sparsity": 0.10241516351982272, + "compression_loss": 25.049453735351562, + "distillation_loss": 1.0396184921264648, + "epoch": 2.25, + "learning_rate": 4.3030900723208414e-05, + "loss": 25.7959, + "step": 2667, + "task_loss": 1.1754810810089111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23328839437976623, + "compression/movement_sparsity/importance_threshold": -0.005369844768866989, + "compression/movement_sparsity/linear_layer_sparsity": 0.1069381294052645, + "compression/movement_sparsity/model_sparsity": 0.10326447568002804, + "compression_loss": 25.125316619873047, + "distillation_loss": 0.7221930027008057, + "epoch": 2.26, + "learning_rate": 4.302620456466611e-05, + "loss": 25.7886, + "step": 2668, + "task_loss": 0.7294876575469971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23399569286432842, + "compression/movement_sparsity/importance_threshold": -0.005364891037842817, + "compression/movement_sparsity/linear_layer_sparsity": 0.10767685543864147, + "compression/movement_sparsity/model_sparsity": 0.10397782420157144, + "compression_loss": 25.201139450073242, + "distillation_loss": 0.7972719669342041, + "epoch": 2.26, + "learning_rate": 4.3021508406123794e-05, + "loss": 25.789, + "step": 2669, + "task_loss": 0.6992921829223633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2347025562221836, + "compression/movement_sparsity/importance_threshold": -0.005359940354330713, + "compression/movement_sparsity/linear_layer_sparsity": 0.10856211756809558, + "compression/movement_sparsity/model_sparsity": 0.10483267485349394, + "compression_loss": 25.276906967163086, + "distillation_loss": 0.43806836009025574, + "epoch": 2.26, + "learning_rate": 4.301681224758148e-05, + "loss": 25.7421, + "step": 2670, + "task_loss": 0.3824222981929779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23540898458721693, + "compression/movement_sparsity/importance_threshold": -0.005354992717392983, + "compression/movement_sparsity/linear_layer_sparsity": 0.10926136268243022, + "compression/movement_sparsity/model_sparsity": 0.1055078987470216, + "compression_loss": 25.352636337280273, + "distillation_loss": 0.8374093174934387, + "epoch": 2.26, + "learning_rate": 4.3012116089039166e-05, + "loss": 26.0694, + "step": 2671, + "task_loss": 0.28923991322517395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2361149780933135, + "compression/movement_sparsity/importance_threshold": -0.005350048126091929, + "compression/movement_sparsity/linear_layer_sparsity": 0.1100203359524529, + "compression/movement_sparsity/model_sparsity": 0.10624079895034413, + "compression_loss": 25.428356170654297, + "distillation_loss": 0.43708139657974243, + "epoch": 2.26, + "learning_rate": 4.300741993049685e-05, + "loss": 25.9787, + "step": 2672, + "task_loss": 1.574414849281311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23682053687435844, + "compression/movement_sparsity/importance_threshold": -0.005345106579489856, + "compression/movement_sparsity/linear_layer_sparsity": 0.1106674963025541, + "compression/movement_sparsity/model_sparsity": 0.10686572735152124, + "compression_loss": 25.503999710083008, + "distillation_loss": 0.8075686693191528, + "epoch": 2.26, + "learning_rate": 4.3002723771954546e-05, + "loss": 26.0629, + "step": 2673, + "task_loss": 0.993084192276001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23752566106423678, + "compression/movement_sparsity/importance_threshold": -0.005340168076649069, + "compression/movement_sparsity/linear_layer_sparsity": 0.11130551081607859, + "compression/movement_sparsity/model_sparsity": 0.10748182410374395, + "compression_loss": 25.579599380493164, + "distillation_loss": 1.2198083400726318, + "epoch": 2.26, + "learning_rate": 4.2998027613412225e-05, + "loss": 26.2822, + "step": 2674, + "task_loss": 0.8923405408859253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2382303507968342, + "compression/movement_sparsity/importance_threshold": -0.0053352326166318685, + "compression/movement_sparsity/linear_layer_sparsity": 0.11205972634321455, + "compression/movement_sparsity/model_sparsity": 0.10821013000728444, + "compression_loss": 25.655153274536133, + "distillation_loss": 0.430992990732193, + "epoch": 2.26, + "learning_rate": 4.299333145486992e-05, + "loss": 26.1573, + "step": 2675, + "task_loss": 0.9558709263801575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23893460620603502, + "compression/movement_sparsity/importance_threshold": -0.005330300198500563, + "compression/movement_sparsity/linear_layer_sparsity": 0.11275744516409181, + "compression/movement_sparsity/model_sparsity": 0.1088838800402304, + "compression_loss": 25.73065185546875, + "distillation_loss": 0.726870059967041, + "epoch": 2.26, + "learning_rate": 4.2988635296327605e-05, + "loss": 26.6212, + "step": 2676, + "task_loss": 0.8127005100250244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.23963842742572505, + "compression/movement_sparsity/importance_threshold": -0.005325370821317453, + "compression/movement_sparsity/linear_layer_sparsity": 0.11351776586505734, + "compression/movement_sparsity/model_sparsity": 0.10961808138609772, + "compression_loss": 25.80610466003418, + "distillation_loss": 0.4320061206817627, + "epoch": 2.26, + "learning_rate": 4.29839391377853e-05, + "loss": 26.1872, + "step": 2677, + "task_loss": 0.13300257921218872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2403418145897892, + "compression/movement_sparsity/importance_threshold": -0.005320444484144844, + "compression/movement_sparsity/linear_layer_sparsity": 0.11421835841033484, + "compression/movement_sparsity/model_sparsity": 0.11029460642217016, + "compression_loss": 25.88150405883789, + "distillation_loss": 0.8299775123596191, + "epoch": 2.26, + "learning_rate": 4.2979242979242984e-05, + "loss": 26.4282, + "step": 2678, + "task_loss": 0.6602033376693726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2410447678321126, + "compression/movement_sparsity/importance_threshold": -0.005315521186045039, + "compression/movement_sparsity/linear_layer_sparsity": 0.11499066289977442, + "compression/movement_sparsity/model_sparsity": 0.11104037987651098, + "compression_loss": 25.956823348999023, + "distillation_loss": 0.36130291223526, + "epoch": 2.26, + "learning_rate": 4.297454682070067e-05, + "loss": 26.5739, + "step": 2679, + "task_loss": 0.7530396580696106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24174728728658035, + "compression/movement_sparsity/importance_threshold": -0.005310600926080343, + "compression/movement_sparsity/linear_layer_sparsity": 0.11569907769902103, + "compression/movement_sparsity/model_sparsity": 0.11172445844806465, + "compression_loss": 26.032115936279297, + "distillation_loss": 0.6786223649978638, + "epoch": 2.27, + "learning_rate": 4.296985066215836e-05, + "loss": 26.6305, + "step": 2680, + "task_loss": 1.134964942932129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2424493730870777, + "compression/movement_sparsity/importance_threshold": -0.005305683703313059, + "compression/movement_sparsity/linear_layer_sparsity": 0.1163613221211816, + "compression/movement_sparsity/model_sparsity": 0.11236395273702185, + "compression_loss": 26.107370376586914, + "distillation_loss": 0.5917405486106873, + "epoch": 2.27, + "learning_rate": 4.296515450361604e-05, + "loss": 26.7383, + "step": 2681, + "task_loss": 0.8230888843536377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24315102536748967, + "compression/movement_sparsity/importance_threshold": -0.005300769516805493, + "compression/movement_sparsity/linear_layer_sparsity": 0.11708786165523473, + "compression/movement_sparsity/model_sparsity": 0.11306553340298323, + "compression_loss": 26.182559967041016, + "distillation_loss": 0.6224719285964966, + "epoch": 2.27, + "learning_rate": 4.2960458345073736e-05, + "loss": 26.837, + "step": 2682, + "task_loss": 0.9494261741638184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2438522442617015, + "compression/movement_sparsity/importance_threshold": -0.005295858365619946, + "compression/movement_sparsity/linear_layer_sparsity": 0.11786633093934204, + "compression/movement_sparsity/model_sparsity": 0.11381725987232982, + "compression_loss": 26.257699966430664, + "distillation_loss": 0.616378664970398, + "epoch": 2.27, + "learning_rate": 4.2955762186531416e-05, + "loss": 26.7586, + "step": 2683, + "task_loss": 1.1050190925598145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24455302990359828, + "compression/movement_sparsity/importance_threshold": -0.005290950248818723, + "compression/movement_sparsity/linear_layer_sparsity": 0.11858961517563059, + "compression/movement_sparsity/model_sparsity": 0.11451569707001928, + "compression_loss": 26.33282470703125, + "distillation_loss": 1.0202922821044922, + "epoch": 2.27, + "learning_rate": 4.295106602798911e-05, + "loss": 26.9108, + "step": 2684, + "task_loss": 0.9176163673400879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24525338242706485, + "compression/movement_sparsity/importance_threshold": -0.0052860451654641325, + "compression/movement_sparsity/linear_layer_sparsity": 0.1191909986461777, + "compression/movement_sparsity/model_sparsity": 0.11509642116828116, + "compression_loss": 26.40787124633789, + "distillation_loss": 0.37210893630981445, + "epoch": 2.27, + "learning_rate": 4.2946369869446796e-05, + "loss": 26.9026, + "step": 2685, + "task_loss": 1.5308212041854858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24595330196598697, + "compression/movement_sparsity/importance_threshold": -0.005281143114618471, + "compression/movement_sparsity/linear_layer_sparsity": 0.11997283054555834, + "compression/movement_sparsity/model_sparsity": 0.1158513947367218, + "compression_loss": 26.48292350769043, + "distillation_loss": 0.8762179017066956, + "epoch": 2.27, + "learning_rate": 4.294167371090448e-05, + "loss": 27.1784, + "step": 2686, + "task_loss": 0.8934304714202881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24665278865424944, + "compression/movement_sparsity/importance_threshold": -0.005276244095344047, + "compression/movement_sparsity/linear_layer_sparsity": 0.1206138976459976, + "compression/movement_sparsity/model_sparsity": 0.11647043921010791, + "compression_loss": 26.557924270629883, + "distillation_loss": 0.8376360535621643, + "epoch": 2.27, + "learning_rate": 4.293697755236217e-05, + "loss": 27.3217, + "step": 2687, + "task_loss": 1.2399060726165771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2473518426257375, + "compression/movement_sparsity/importance_threshold": -0.005271348106703163, + "compression/movement_sparsity/linear_layer_sparsity": 0.1212490265109542, + "compression/movement_sparsity/model_sparsity": 0.11708374944466833, + "compression_loss": 26.632892608642578, + "distillation_loss": 0.62364262342453, + "epoch": 2.27, + "learning_rate": 4.2932281393819855e-05, + "loss": 27.3694, + "step": 2688, + "task_loss": 1.3516508340835571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24805046401433617, + "compression/movement_sparsity/importance_threshold": -0.005266455147758123, + "compression/movement_sparsity/linear_layer_sparsity": 0.12196339146985112, + "compression/movement_sparsity/model_sparsity": 0.11777357376958347, + "compression_loss": 26.70783042907715, + "distillation_loss": 0.3320034146308899, + "epoch": 2.27, + "learning_rate": 4.292758523527755e-05, + "loss": 27.2209, + "step": 2689, + "task_loss": 0.23361852765083313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24874865295393067, + "compression/movement_sparsity/importance_threshold": -0.005261565217571232, + "compression/movement_sparsity/linear_layer_sparsity": 0.12246403957221047, + "compression/movement_sparsity/model_sparsity": 0.11825702306945307, + "compression_loss": 26.782724380493164, + "distillation_loss": 0.5519956946372986, + "epoch": 2.27, + "learning_rate": 4.2922889076735234e-05, + "loss": 27.356, + "step": 2690, + "task_loss": 1.7913850545883179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.24944640957840603, + "compression/movement_sparsity/importance_threshold": -0.0052566783152047936, + "compression/movement_sparsity/linear_layer_sparsity": 0.12316328468654511, + "compression/movement_sparsity/model_sparsity": 0.11893224696298073, + "compression_loss": 26.85756492614746, + "distillation_loss": 0.6947104930877686, + "epoch": 2.27, + "learning_rate": 4.291819291819292e-05, + "loss": 27.5564, + "step": 2691, + "task_loss": 1.0279330015182495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2501437340216476, + "compression/movement_sparsity/importance_threshold": -0.005251794439721111, + "compression/movement_sparsity/linear_layer_sparsity": 0.12376778036484519, + "compression/movement_sparsity/model_sparsity": 0.11951597635508499, + "compression_loss": 26.932369232177734, + "distillation_loss": 0.5363233685493469, + "epoch": 2.28, + "learning_rate": 4.291349675965061e-05, + "loss": 27.4958, + "step": 2692, + "task_loss": 0.7387153506278992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.25084062641754024, + "compression/movement_sparsity/importance_threshold": -0.0052469135901824894, + "compression/movement_sparsity/linear_layer_sparsity": 0.1243664689735066, + "compression/movement_sparsity/model_sparsity": 0.12009409816825728, + "compression_loss": 27.007102966308594, + "distillation_loss": 0.41162562370300293, + "epoch": 2.28, + "learning_rate": 4.290880060110829e-05, + "loss": 27.5631, + "step": 2693, + "task_loss": 0.5103681087493896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.25153708689996934, + "compression/movement_sparsity/importance_threshold": -0.005242035765651232, + "compression/movement_sparsity/linear_layer_sparsity": 0.12496022097676676, + "compression/movement_sparsity/model_sparsity": 0.12066745296361064, + "compression_loss": 27.08177375793457, + "distillation_loss": 0.6186453104019165, + "epoch": 2.28, + "learning_rate": 4.2904104442565986e-05, + "loss": 27.6486, + "step": 2694, + "task_loss": 1.090255856513977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2522331156028199, + "compression/movement_sparsity/importance_threshold": -0.005237160965189642, + "compression/movement_sparsity/linear_layer_sparsity": 0.12549767898461803, + "compression/movement_sparsity/model_sparsity": 0.12118644763547798, + "compression_loss": 27.156396865844727, + "distillation_loss": 0.5647468566894531, + "epoch": 2.28, + "learning_rate": 4.289940828402367e-05, + "loss": 27.7291, + "step": 2695, + "task_loss": 0.6037712693214417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.25292871265997685, + "compression/movement_sparsity/importance_threshold": -0.005232289187860027, + "compression/movement_sparsity/linear_layer_sparsity": 0.12612195685702599, + "compression/movement_sparsity/model_sparsity": 0.12178927964246537, + "compression_loss": 27.23097038269043, + "distillation_loss": 0.4375886619091034, + "epoch": 2.28, + "learning_rate": 4.289471212548136e-05, + "loss": 27.8029, + "step": 2696, + "task_loss": 1.437133550643921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.25362387820532584, + "compression/movement_sparsity/importance_threshold": -0.005227420432724686, + "compression/movement_sparsity/linear_layer_sparsity": 0.1266073062523085, + "compression/movement_sparsity/model_sparsity": 0.12225795579291057, + "compression_loss": 27.305477142333984, + "distillation_loss": 1.0117186307907104, + "epoch": 2.28, + "learning_rate": 4.2890015966939045e-05, + "loss": 28.0432, + "step": 2697, + "task_loss": 0.5592117309570312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2543186123727518, + "compression/movement_sparsity/importance_threshold": -0.0052225546988459255, + "compression/movement_sparsity/linear_layer_sparsity": 0.12726474523491182, + "compression/movement_sparsity/model_sparsity": 0.12289280972394258, + "compression_loss": 27.37994384765625, + "distillation_loss": 0.4647761285305023, + "epoch": 2.28, + "learning_rate": 4.288531980839673e-05, + "loss": 28.0377, + "step": 2698, + "task_loss": 0.0678580030798912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2550129152961398, + "compression/movement_sparsity/importance_threshold": -0.005217691985286049, + "compression/movement_sparsity/linear_layer_sparsity": 0.127994361204215, + "compression/movement_sparsity/model_sparsity": 0.12359736114013893, + "compression_loss": 27.45435905456543, + "distillation_loss": 0.9605753421783447, + "epoch": 2.28, + "learning_rate": 4.2880623649854425e-05, + "loss": 28.1652, + "step": 2699, + "task_loss": 2.166354179382324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.255706787109375, + "compression/movement_sparsity/importance_threshold": -0.005212832291107361, + "compression/movement_sparsity/linear_layer_sparsity": 0.12869359439438202, + "compression/movement_sparsity/model_sparsity": 0.1242725735191308, + "compression_loss": 27.52870750427246, + "distillation_loss": 0.3408774733543396, + "epoch": 2.28, + "learning_rate": 4.2875927491312104e-05, + "loss": 28.0606, + "step": 2700, + "task_loss": 0.9944912195205688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.25640022794634254, + "compression/movement_sparsity/importance_threshold": -0.005207975615372165, + "compression/movement_sparsity/linear_layer_sparsity": 0.1293686096000806, + "compression/movement_sparsity/model_sparsity": 0.12492439987592396, + "compression_loss": 27.602998733520508, + "distillation_loss": 0.30047574639320374, + "epoch": 2.28, + "learning_rate": 4.28712313327698e-05, + "loss": 27.995, + "step": 2701, + "task_loss": 0.11674318462610245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.25709323794092764, + "compression/movement_sparsity/importance_threshold": -0.005203121957142765, + "compression/movement_sparsity/linear_layer_sparsity": 0.130003547678355, + "compression/movement_sparsity/model_sparsity": 0.12553752587791167, + "compression_loss": 27.677228927612305, + "distillation_loss": 0.3178798258304596, + "epoch": 2.28, + "learning_rate": 4.2866535174227484e-05, + "loss": 27.9639, + "step": 2702, + "task_loss": 0.40651875734329224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2577858172270152, + "compression/movement_sparsity/importance_threshold": -0.005198271315481466, + "compression/movement_sparsity/linear_layer_sparsity": 0.1306783363248685, + "compression/movement_sparsity/model_sparsity": 0.12618913345852473, + "compression_loss": 27.75139617919922, + "distillation_loss": 0.6656696200370789, + "epoch": 2.28, + "learning_rate": 4.286183901568518e-05, + "loss": 28.4294, + "step": 2703, + "task_loss": 0.8249683976173401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2584779659384907, + "compression/movement_sparsity/importance_threshold": -0.0051934236894505695, + "compression/movement_sparsity/linear_layer_sparsity": 0.1314443687021316, + "compression/movement_sparsity/model_sparsity": 0.12692885026703762, + "compression_loss": 27.825525283813477, + "distillation_loss": 0.5948188304901123, + "epoch": 2.29, + "learning_rate": 4.2857142857142856e-05, + "loss": 28.5727, + "step": 2704, + "task_loss": 1.3130605220794678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2591696842092388, + "compression/movement_sparsity/importance_threshold": -0.005188579078112384, + "compression/movement_sparsity/linear_layer_sparsity": 0.13224436110882168, + "compression/movement_sparsity/model_sparsity": 0.12770136047349337, + "compression_loss": 27.899625778198242, + "distillation_loss": 0.6174355745315552, + "epoch": 2.29, + "learning_rate": 4.285244669860054e-05, + "loss": 28.567, + "step": 2705, + "task_loss": 0.6900280117988586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2598609721731452, + "compression/movement_sparsity/importance_threshold": -0.005183737480529209, + "compression/movement_sparsity/linear_layer_sparsity": 0.1330290786567702, + "compression/movement_sparsity/model_sparsity": 0.1284591205595963, + "compression_loss": 27.973655700683594, + "distillation_loss": 0.46737998723983765, + "epoch": 2.29, + "learning_rate": 4.2847750540058236e-05, + "loss": 28.4288, + "step": 2706, + "task_loss": 0.6539822816848755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.26055182996409476, + "compression/movement_sparsity/importance_threshold": -0.005178898895763349, + "compression/movement_sparsity/linear_layer_sparsity": 0.13371898714784594, + "compression/movement_sparsity/model_sparsity": 0.12912532857159684, + "compression_loss": 28.047645568847656, + "distillation_loss": 0.3651961386203766, + "epoch": 2.29, + "learning_rate": 4.284305438151592e-05, + "loss": 28.5704, + "step": 2707, + "task_loss": 0.5148100256919861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.26124225771597265, + "compression/movement_sparsity/importance_threshold": -0.00517406332287711, + "compression/movement_sparsity/linear_layer_sparsity": 0.1344594660338654, + "compression/movement_sparsity/model_sparsity": 0.12984036972990204, + "compression_loss": 28.121553421020508, + "distillation_loss": 0.5852566361427307, + "epoch": 2.29, + "learning_rate": 4.283835822297361e-05, + "loss": 28.6833, + "step": 2708, + "task_loss": 0.4743536412715912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2619322555626641, + "compression/movement_sparsity/importance_threshold": -0.0051692307609327946, + "compression/movement_sparsity/linear_layer_sparsity": 0.13507759103577327, + "compression/movement_sparsity/model_sparsity": 0.13043726023641944, + "compression_loss": 28.195451736450195, + "distillation_loss": 0.5227553844451904, + "epoch": 2.29, + "learning_rate": 4.2833662064431295e-05, + "loss": 28.8209, + "step": 2709, + "task_loss": 0.5351753234863281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2626218236380541, + "compression/movement_sparsity/importance_threshold": -0.005164401208992708, + "compression/movement_sparsity/linear_layer_sparsity": 0.13567320320918463, + "compression/movement_sparsity/model_sparsity": 0.13101241129935676, + "compression_loss": 28.269306182861328, + "distillation_loss": 0.3377804160118103, + "epoch": 2.29, + "learning_rate": 4.282896590588899e-05, + "loss": 28.7456, + "step": 2710, + "task_loss": 0.4308357238769531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2633109620760279, + "compression/movement_sparsity/importance_threshold": -0.005159574666119151, + "compression/movement_sparsity/linear_layer_sparsity": 0.13635741194813045, + "compression/movement_sparsity/model_sparsity": 0.1316731153632475, + "compression_loss": 28.34311866760254, + "distillation_loss": 0.7233242988586426, + "epoch": 2.29, + "learning_rate": 4.2824269747346674e-05, + "loss": 28.9653, + "step": 2711, + "task_loss": 1.068498134613037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2639996710104705, + "compression/movement_sparsity/importance_threshold": -0.005154751131374432, + "compression/movement_sparsity/linear_layer_sparsity": 0.13705034917778572, + "compression/movement_sparsity/model_sparsity": 0.13234224806733985, + "compression_loss": 28.416860580444336, + "distillation_loss": 0.5429000854492188, + "epoch": 2.29, + "learning_rate": 4.281957358880436e-05, + "loss": 29.0215, + "step": 2712, + "task_loss": 0.8164119124412537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2646879505752672, + "compression/movement_sparsity/importance_threshold": -0.005149930603820853, + "compression/movement_sparsity/linear_layer_sparsity": 0.1377448127008984, + "compression/movement_sparsity/model_sparsity": 0.13301285463201393, + "compression_loss": 28.490558624267578, + "distillation_loss": 0.5070241689682007, + "epoch": 2.29, + "learning_rate": 4.281487743026205e-05, + "loss": 29.0943, + "step": 2713, + "task_loss": 0.33749064803123474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.265375800904303, + "compression/movement_sparsity/importance_threshold": -0.0051451130825207175, + "compression/movement_sparsity/linear_layer_sparsity": 0.13842879488065912, + "compression/movement_sparsity/model_sparsity": 0.13367333991972458, + "compression_loss": 28.564218521118164, + "distillation_loss": 0.47675377130508423, + "epoch": 2.29, + "learning_rate": 4.281018127171973e-05, + "loss": 29.2126, + "step": 2714, + "task_loss": 0.824842095375061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2660632221314633, + "compression/movement_sparsity/importance_threshold": -0.00514029856653633, + "compression/movement_sparsity/linear_layer_sparsity": 0.13915836315329175, + "compression/movement_sparsity/model_sparsity": 0.13437784527777777, + "compression_loss": 28.637840270996094, + "distillation_loss": 0.5140252113342285, + "epoch": 2.29, + "learning_rate": 4.2805485113177426e-05, + "loss": 29.2284, + "step": 2715, + "task_loss": 0.7775077819824219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2667502143906326, + "compression/movement_sparsity/importance_threshold": -0.005135487054929996, + "compression/movement_sparsity/linear_layer_sparsity": 0.13994288991455808, + "compression/movement_sparsity/model_sparsity": 0.13513542113130797, + "compression_loss": 28.711400985717773, + "distillation_loss": 0.742621123790741, + "epoch": 2.3, + "learning_rate": 4.280078895463511e-05, + "loss": 29.4505, + "step": 2716, + "task_loss": 1.4261082410812378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.26743677781569686, + "compression/movement_sparsity/importance_threshold": -0.005130678546764016, + "compression/movement_sparsity/linear_layer_sparsity": 0.14063892742779868, + "compression/movement_sparsity/model_sparsity": 0.13580754761470692, + "compression_loss": 28.784929275512695, + "distillation_loss": 0.5026398301124573, + "epoch": 2.3, + "learning_rate": 4.27960927960928e-05, + "loss": 29.3382, + "step": 2717, + "task_loss": 0.4673776626586914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.26812291254054077, + "compression/movement_sparsity/importance_threshold": -0.005125873041100695, + "compression/movement_sparsity/linear_layer_sparsity": 0.14139141395062746, + "compression/movement_sparsity/model_sparsity": 0.13653418391055722, + "compression_loss": 28.85843276977539, + "distillation_loss": 0.8410535454750061, + "epoch": 2.3, + "learning_rate": 4.2791396637550485e-05, + "loss": 29.4094, + "step": 2718, + "task_loss": 0.5263946652412415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2688086186990496, + "compression/movement_sparsity/importance_threshold": -0.005121070537002339, + "compression/movement_sparsity/linear_layer_sparsity": 0.14202483765961213, + "compression/movement_sparsity/model_sparsity": 0.137145847566499, + "compression_loss": 28.931859970092773, + "distillation_loss": 0.8662497997283936, + "epoch": 2.3, + "learning_rate": 4.278670047900817e-05, + "loss": 29.472, + "step": 2719, + "task_loss": 1.3620120286941528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.26949389642510846, + "compression/movement_sparsity/importance_threshold": -0.005116271033531251, + "compression/movement_sparsity/linear_layer_sparsity": 0.1428006001576661, + "compression/movement_sparsity/model_sparsity": 0.13789496023622025, + "compression_loss": 29.005239486694336, + "distillation_loss": 0.5457649230957031, + "epoch": 2.3, + "learning_rate": 4.2782004320465865e-05, + "loss": 29.6469, + "step": 2720, + "task_loss": 0.6805235147476196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2701787458526025, + "compression/movement_sparsity/importance_threshold": -0.005111474529749733, + "compression/movement_sparsity/linear_layer_sparsity": 0.14356551166317144, + "compression/movement_sparsity/model_sparsity": 0.13863359467836844, + "compression_loss": 29.07856559753418, + "distillation_loss": 0.32495689392089844, + "epoch": 2.3, + "learning_rate": 4.2777308161923544e-05, + "loss": 29.5127, + "step": 2721, + "task_loss": 0.1851312667131424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2708631671154169, + "compression/movement_sparsity/importance_threshold": -0.0051066810247200915, + "compression/movement_sparsity/linear_layer_sparsity": 0.14431035479454563, + "compression/movement_sparsity/model_sparsity": 0.13935285015677443, + "compression_loss": 29.151853561401367, + "distillation_loss": 0.44361308217048645, + "epoch": 2.3, + "learning_rate": 4.277261200338124e-05, + "loss": 29.5889, + "step": 2722, + "task_loss": 0.11154741793870926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27154716034743664, + "compression/movement_sparsity/importance_threshold": -0.005101890517504631, + "compression/movement_sparsity/linear_layer_sparsity": 0.14512277218391226, + "compression/movement_sparsity/model_sparsity": 0.14013735850952808, + "compression_loss": 29.225065231323242, + "distillation_loss": 0.3395731449127197, + "epoch": 2.3, + "learning_rate": 4.2767915844838924e-05, + "loss": 29.799, + "step": 2723, + "task_loss": 0.5562704205513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.272230725682547, + "compression/movement_sparsity/importance_threshold": -0.005097103007165652, + "compression/movement_sparsity/linear_layer_sparsity": 0.14584204989987515, + "compression/movement_sparsity/model_sparsity": 0.1408319268231906, + "compression_loss": 29.298229217529297, + "distillation_loss": 0.6963639855384827, + "epoch": 2.3, + "learning_rate": 4.276321968629661e-05, + "loss": 29.8716, + "step": 2724, + "task_loss": 0.3846713900566101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27291386325463274, + "compression/movement_sparsity/importance_threshold": -0.005092318492765465, + "compression/movement_sparsity/linear_layer_sparsity": 0.14664526183182688, + "compression/movement_sparsity/model_sparsity": 0.14160754595431085, + "compression_loss": 29.371349334716797, + "distillation_loss": 0.5544654726982117, + "epoch": 2.3, + "learning_rate": 4.27585235277543e-05, + "loss": 30.005, + "step": 2725, + "task_loss": 0.5377510786056519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27359657319758, + "compression/movement_sparsity/importance_threshold": -0.005087536973366364, + "compression/movement_sparsity/linear_layer_sparsity": 0.14736566041954755, + "compression/movement_sparsity/model_sparsity": 0.14230319663433805, + "compression_loss": 29.444408416748047, + "distillation_loss": 0.5729637145996094, + "epoch": 2.3, + "learning_rate": 4.275382736921198e-05, + "loss": 30.0073, + "step": 2726, + "task_loss": 0.36497941613197327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27427885564527266, + "compression/movement_sparsity/importance_threshold": -0.0050827584480306635, + "compression/movement_sparsity/linear_layer_sparsity": 0.1481015246526919, + "compression/movement_sparsity/model_sparsity": 0.14301378166729073, + "compression_loss": 29.517473220825195, + "distillation_loss": 0.9229831695556641, + "epoch": 2.3, + "learning_rate": 4.2749131210669676e-05, + "loss": 30.2028, + "step": 2727, + "task_loss": 0.811104416847229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2749607107315968, + "compression/movement_sparsity/importance_threshold": -0.00507798291582066, + "compression/movement_sparsity/linear_layer_sparsity": 0.14883112869782747, + "compression/movement_sparsity/model_sparsity": 0.1437183215689513, + "compression_loss": 29.590482711791992, + "distillation_loss": 0.44955721497535706, + "epoch": 2.31, + "learning_rate": 4.274443505212736e-05, + "loss": 30.1516, + "step": 2728, + "task_loss": 0.3513813614845276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27564213859043707, + "compression/movement_sparsity/importance_threshold": -0.005073210375798661, + "compression/movement_sparsity/linear_layer_sparsity": 0.1496141291656364, + "compression/movement_sparsity/model_sparsity": 0.14447442356189982, + "compression_loss": 29.66345977783203, + "distillation_loss": 0.4217585623264313, + "epoch": 2.31, + "learning_rate": 4.273973889358505e-05, + "loss": 30.2841, + "step": 2729, + "task_loss": 0.7661373615264893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27632313935567887, + "compression/movement_sparsity/importance_threshold": -0.005068440827026968, + "compression/movement_sparsity/linear_layer_sparsity": 0.1502887151013001, + "compression/movement_sparsity/model_sparsity": 0.14512583539540438, + "compression_loss": 29.736377716064453, + "distillation_loss": 0.3985757827758789, + "epoch": 2.31, + "learning_rate": 4.2735042735042735e-05, + "loss": 30.3012, + "step": 2730, + "task_loss": 0.39602574706077576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2770037131612072, + "compression/movement_sparsity/importance_threshold": -0.005063674268567888, + "compression/movement_sparsity/linear_layer_sparsity": 0.15123041431617473, + "compression/movement_sparsity/model_sparsity": 0.14603518434524246, + "compression_loss": 29.809223175048828, + "distillation_loss": 0.7248553037643433, + "epoch": 2.31, + "learning_rate": 4.273034657650042e-05, + "loss": 30.6188, + "step": 2731, + "task_loss": 0.7075513601303101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2776838601409073, + "compression/movement_sparsity/importance_threshold": -0.0050589106994837225, + "compression/movement_sparsity/linear_layer_sparsity": 0.1519691999703899, + "compression/movement_sparsity/model_sparsity": 0.1467485904394648, + "compression_loss": 29.88204002380371, + "distillation_loss": 0.5001782774925232, + "epoch": 2.31, + "learning_rate": 4.2725650417958114e-05, + "loss": 30.4618, + "step": 2732, + "task_loss": 0.422024667263031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27836358042866427, + "compression/movement_sparsity/importance_threshold": -0.005054150118836777, + "compression/movement_sparsity/linear_layer_sparsity": 0.15271559324355674, + "compression/movement_sparsity/model_sparsity": 0.14746934280752408, + "compression_loss": 29.954801559448242, + "distillation_loss": 0.5487746596336365, + "epoch": 2.31, + "learning_rate": 4.27209542594158e-05, + "loss": 30.7355, + "step": 2733, + "task_loss": 0.8000732660293579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.27904287415836315, + "compression/movement_sparsity/importance_threshold": -0.005049392525689355, + "compression/movement_sparsity/linear_layer_sparsity": 0.15335245111282056, + "compression/movement_sparsity/model_sparsity": 0.14808432264977472, + "compression_loss": 30.02753257751465, + "distillation_loss": 0.47658079862594604, + "epoch": 2.31, + "learning_rate": 4.271625810087349e-05, + "loss": 30.5457, + "step": 2734, + "task_loss": 0.42671340703964233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2797217414638892, + "compression/movement_sparsity/importance_threshold": -0.00504463791910376, + "compression/movement_sparsity/linear_layer_sparsity": 0.15396908559377395, + "compression/movement_sparsity/model_sparsity": 0.1486797738393178, + "compression_loss": 30.100200653076172, + "distillation_loss": 0.30988606810569763, + "epoch": 2.31, + "learning_rate": 4.2711561942331173e-05, + "loss": 30.5613, + "step": 2735, + "task_loss": 0.7972187995910645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2804001824791271, + "compression/movement_sparsity/importance_threshold": -0.005039886298142299, + "compression/movement_sparsity/linear_layer_sparsity": 0.15474139008321353, + "compression/movement_sparsity/model_sparsity": 0.14942554729365862, + "compression_loss": 30.17279815673828, + "distillation_loss": 0.8626722097396851, + "epoch": 2.31, + "learning_rate": 4.270686578378886e-05, + "loss": 30.8724, + "step": 2736, + "task_loss": 1.3828822374343872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2810781973379629, + "compression/movement_sparsity/importance_threshold": -0.005035137661867271, + "compression/movement_sparsity/linear_layer_sparsity": 0.15543779724565082, + "compression/movement_sparsity/model_sparsity": 0.1500980307276672, + "compression_loss": 30.245351791381836, + "distillation_loss": 0.32391732931137085, + "epoch": 2.31, + "learning_rate": 4.270216962524655e-05, + "loss": 30.8046, + "step": 2737, + "task_loss": 0.612346351146698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2817557861742811, + "compression/movement_sparsity/importance_threshold": -0.0050303920093409836, + "compression/movement_sparsity/linear_layer_sparsity": 0.15619793908410182, + "compression/movement_sparsity/model_sparsity": 0.15083205935549757, + "compression_loss": 30.317852020263672, + "distillation_loss": 0.5382668972015381, + "epoch": 2.31, + "learning_rate": 4.269747346670423e-05, + "loss": 30.8806, + "step": 2738, + "task_loss": 0.9251440763473511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.28243294912196704, + "compression/movement_sparsity/importance_threshold": -0.005025649339625739, + "compression/movement_sparsity/linear_layer_sparsity": 0.15690938262192794, + "compression/movement_sparsity/model_sparsity": 0.15151906261914305, + "compression_loss": 30.390329360961914, + "distillation_loss": 0.3689855635166168, + "epoch": 2.32, + "learning_rate": 4.2692777308161926e-05, + "loss": 30.9945, + "step": 2739, + "task_loss": 0.7400779128074646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.28310968631490585, + "compression/movement_sparsity/importance_threshold": -0.005020909651783841, + "compression/movement_sparsity/linear_layer_sparsity": 0.15775497304565755, + "compression/movement_sparsity/model_sparsity": 0.15233560441047714, + "compression_loss": 30.462745666503906, + "distillation_loss": 0.7798584699630737, + "epoch": 2.32, + "learning_rate": 4.268808114961961e-05, + "loss": 31.1542, + "step": 2740, + "task_loss": 1.557666301727295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.28378599788698256, + "compression/movement_sparsity/importance_threshold": -0.005016172944877595, + "compression/movement_sparsity/linear_layer_sparsity": 0.15854138382541033, + "compression/movement_sparsity/model_sparsity": 0.1530949995606629, + "compression_loss": 30.535112380981445, + "distillation_loss": 0.8682030439376831, + "epoch": 2.32, + "learning_rate": 4.2683384991077305e-05, + "loss": 31.4185, + "step": 2741, + "task_loss": 1.2488949298858643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2844618839720825, + "compression/movement_sparsity/importance_threshold": -0.0050114392179693035, + "compression/movement_sparsity/linear_layer_sparsity": 0.15918415608182154, + "compression/movement_sparsity/model_sparsity": 0.1537156906126676, + "compression_loss": 30.60744285583496, + "distillation_loss": 0.5128454566001892, + "epoch": 2.32, + "learning_rate": 4.267868883253499e-05, + "loss": 31.1254, + "step": 2742, + "task_loss": 0.4483601152896881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2851373447040906, + "compression/movement_sparsity/importance_threshold": -0.005006708470121272, + "compression/movement_sparsity/linear_layer_sparsity": 0.1599839577018294, + "compression/movement_sparsity/model_sparsity": 0.15448801658655065, + "compression_loss": 30.679731369018555, + "distillation_loss": 0.726369321346283, + "epoch": 2.32, + "learning_rate": 4.267399267399267e-05, + "loss": 31.1988, + "step": 2743, + "task_loss": 2.1011123657226562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.28581238021689204, + "compression/movement_sparsity/importance_threshold": -0.0050019807003958035, + "compression/movement_sparsity/linear_layer_sparsity": 0.16066796372992542, + "compression/movement_sparsity/model_sparsity": 0.1551485249033329, + "compression_loss": 30.75197410583496, + "distillation_loss": 0.6059142351150513, + "epoch": 2.32, + "learning_rate": 4.2669296515450364e-05, + "loss": 31.3472, + "step": 2744, + "task_loss": 0.28157737851142883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2864869906443719, + "compression/movement_sparsity/importance_threshold": -0.004997255907855204, + "compression/movement_sparsity/linear_layer_sparsity": 0.16140496075899521, + "compression/movement_sparsity/model_sparsity": 0.15586020381718607, + "compression_loss": 30.824174880981445, + "distillation_loss": 0.6148947477340698, + "epoch": 2.32, + "learning_rate": 4.266460035690805e-05, + "loss": 31.3927, + "step": 2745, + "task_loss": 0.8020234107971191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2871611761204158, + "compression/movement_sparsity/importance_threshold": -0.004992534091561773, + "compression/movement_sparsity/linear_layer_sparsity": 0.16212117396387568, + "compression/movement_sparsity/model_sparsity": 0.15655181289514936, + "compression_loss": 30.896345138549805, + "distillation_loss": 0.4758932590484619, + "epoch": 2.32, + "learning_rate": 4.2659904198365744e-05, + "loss": 31.6064, + "step": 2746, + "task_loss": 0.5901272296905518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.28783493677890815, + "compression/movement_sparsity/importance_threshold": -0.004987815250577819, + "compression/movement_sparsity/linear_layer_sparsity": 0.16274734777893776, + "compression/movement_sparsity/model_sparsity": 0.15715647571332808, + "compression_loss": 30.968469619750977, + "distillation_loss": 0.5715819597244263, + "epoch": 2.32, + "learning_rate": 4.265520803982342e-05, + "loss": 31.7002, + "step": 2747, + "task_loss": 0.5360541343688965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2885082727537346, + "compression/movement_sparsity/importance_threshold": -0.0049830993839656435, + "compression/movement_sparsity/linear_layer_sparsity": 0.16332754200359595, + "compression/movement_sparsity/model_sparsity": 0.15771673848148304, + "compression_loss": 31.040565490722656, + "distillation_loss": 0.9705923199653625, + "epoch": 2.32, + "learning_rate": 4.2650511881281116e-05, + "loss": 32.0055, + "step": 2748, + "task_loss": 0.4924909174442291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.28918118417878014, + "compression/movement_sparsity/importance_threshold": -0.004978386490787552, + "compression/movement_sparsity/linear_layer_sparsity": 0.16395792504983348, + "compression/movement_sparsity/model_sparsity": 0.15832546593079724, + "compression_loss": 31.11263084411621, + "distillation_loss": 1.2930912971496582, + "epoch": 2.32, + "learning_rate": 4.26458157227388e-05, + "loss": 32.1368, + "step": 2749, + "task_loss": 1.662111759185791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.28985367118793004, + "compression/movement_sparsity/importance_threshold": -0.004973676570105846, + "compression/movement_sparsity/linear_layer_sparsity": 0.1646846315222335, + "compression/movement_sparsity/model_sparsity": 0.15902720780025975, + "compression_loss": 31.184650421142578, + "distillation_loss": 0.7548186779022217, + "epoch": 2.32, + "learning_rate": 4.264111956419649e-05, + "loss": 31.9204, + "step": 2750, + "task_loss": 1.3140308856964111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2905257339150692, + "compression/movement_sparsity/importance_threshold": -0.004968969620982832, + "compression/movement_sparsity/linear_layer_sparsity": 0.16547683744745734, + "compression/movement_sparsity/model_sparsity": 0.15979219901484165, + "compression_loss": 31.256622314453125, + "distillation_loss": 0.9272348880767822, + "epoch": 2.33, + "learning_rate": 4.2636423405654175e-05, + "loss": 32.0777, + "step": 2751, + "task_loss": 1.0073527097702026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.29119737249408295, + "compression/movement_sparsity/importance_threshold": -0.004964265642480812, + "compression/movement_sparsity/linear_layer_sparsity": 0.16614864505206187, + "compression/movement_sparsity/model_sparsity": 0.16044092796150608, + "compression_loss": 31.328554153442383, + "distillation_loss": 1.0228999853134155, + "epoch": 2.33, + "learning_rate": 4.263172724711186e-05, + "loss": 32.2044, + "step": 2752, + "task_loss": 1.1393083333969116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2918685870588563, + "compression/movement_sparsity/importance_threshold": -0.004959564633662093, + "compression/movement_sparsity/linear_layer_sparsity": 0.166756348331456, + "compression/movement_sparsity/model_sparsity": 0.16102775476373907, + "compression_loss": 31.40045166015625, + "distillation_loss": 0.24133244156837463, + "epoch": 2.33, + "learning_rate": 4.2627031088569555e-05, + "loss": 31.9701, + "step": 2753, + "task_loss": 0.4456300735473633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.29253937774327454, + "compression/movement_sparsity/importance_threshold": -0.004954866593588975, + "compression/movement_sparsity/linear_layer_sparsity": 0.167522809978754, + "compression/movement_sparsity/model_sparsity": 0.16176788609554055, + "compression_loss": 31.472288131713867, + "distillation_loss": 0.2931180000305176, + "epoch": 2.33, + "learning_rate": 4.262233493002724e-05, + "loss": 32.0299, + "step": 2754, + "task_loss": 0.31185248494148254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.29320974468122263, + "compression/movement_sparsity/importance_threshold": -0.004950171521323766, + "compression/movement_sparsity/linear_layer_sparsity": 0.1681213316490685, + "compression/movement_sparsity/model_sparsity": 0.16234584670521174, + "compression_loss": 31.544086456298828, + "distillation_loss": 0.8400447368621826, + "epoch": 2.33, + "learning_rate": 4.261763877148493e-05, + "loss": 32.294, + "step": 2755, + "task_loss": 1.5892164707183838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2938796880065856, + "compression/movement_sparsity/importance_threshold": -0.004945479415928769, + "compression/movement_sparsity/linear_layer_sparsity": 0.16871813623924345, + "compression/movement_sparsity/model_sparsity": 0.16292214922172849, + "compression_loss": 31.615821838378906, + "distillation_loss": 0.5703263878822327, + "epoch": 2.33, + "learning_rate": 4.2612942612942614e-05, + "loss": 32.357, + "step": 2756, + "task_loss": 0.8560513257980347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.294549207853249, + "compression/movement_sparsity/importance_threshold": -0.004940790276466285, + "compression/movement_sparsity/linear_layer_sparsity": 0.16926456122115688, + "compression/movement_sparsity/model_sparsity": 0.16344980282451332, + "compression_loss": 31.68752670288086, + "distillation_loss": 0.6552695631980896, + "epoch": 2.33, + "learning_rate": 4.26082464544003e-05, + "loss": 32.3677, + "step": 2757, + "task_loss": 1.5018903017044067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.29521830435509777, + "compression/movement_sparsity/importance_threshold": -0.00493610410199862, + "compression/movement_sparsity/linear_layer_sparsity": 0.17000480162382361, + "compression/movement_sparsity/model_sparsity": 0.1641646136921026, + "compression_loss": 31.7591609954834, + "distillation_loss": 0.6357307434082031, + "epoch": 2.33, + "learning_rate": 4.260355029585799e-05, + "loss": 32.5979, + "step": 2758, + "task_loss": 0.7213913202285767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.295886977646017, + "compression/movement_sparsity/importance_threshold": -0.004931420891588078, + "compression/movement_sparsity/linear_layer_sparsity": 0.17069354154647107, + "compression/movement_sparsity/model_sparsity": 0.1648296932795953, + "compression_loss": 31.830745697021484, + "distillation_loss": 0.6367267370223999, + "epoch": 2.33, + "learning_rate": 4.259885413731568e-05, + "loss": 32.5319, + "step": 2759, + "task_loss": 1.1770179271697998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.29655522785989197, + "compression/movement_sparsity/importance_threshold": -0.004926740644296964, + "compression/movement_sparsity/linear_layer_sparsity": 0.17134394527016922, + "compression/movement_sparsity/model_sparsity": 0.16545775363450854, + "compression_loss": 31.902311325073242, + "distillation_loss": 0.6723724603652954, + "epoch": 2.33, + "learning_rate": 4.2594157978773366e-05, + "loss": 32.4794, + "step": 2760, + "task_loss": 1.36005699634552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.29722305513060765, + "compression/movement_sparsity/importance_threshold": -0.00492206335918758, + "compression/movement_sparsity/linear_layer_sparsity": 0.1719671022708194, + "compression/movement_sparsity/model_sparsity": 0.16605950327513122, + "compression_loss": 31.97381591796875, + "distillation_loss": 0.5588663816452026, + "epoch": 2.33, + "learning_rate": 4.258946182023105e-05, + "loss": 32.6599, + "step": 2761, + "task_loss": 0.1758635938167572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.2978904595920493, + "compression/movement_sparsity/importance_threshold": -0.004917389035322231, + "compression/movement_sparsity/linear_layer_sparsity": 0.17272389341816471, + "compression/movement_sparsity/model_sparsity": 0.16679029631840336, + "compression_loss": 32.04526138305664, + "distillation_loss": 0.5710480213165283, + "epoch": 2.33, + "learning_rate": 4.258476566168874e-05, + "loss": 32.6436, + "step": 2762, + "task_loss": 1.478938341140747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.29855744137810203, + "compression/movement_sparsity/importance_threshold": -0.00491271767176322, + "compression/movement_sparsity/linear_layer_sparsity": 0.17366256389445986, + "compression/movement_sparsity/model_sparsity": 0.1676967205761496, + "compression_loss": 32.11666488647461, + "distillation_loss": 0.581039547920227, + "epoch": 2.34, + "learning_rate": 4.258006950314643e-05, + "loss": 32.6167, + "step": 2763, + "task_loss": 0.32748207449913025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.299224000622651, + "compression/movement_sparsity/importance_threshold": -0.004908049267572852, + "compression/movement_sparsity/linear_layer_sparsity": 0.17435265124805016, + "compression/movement_sparsity/model_sparsity": 0.16836310130618706, + "compression_loss": 32.187984466552734, + "distillation_loss": 0.7095096111297607, + "epoch": 2.34, + "learning_rate": 4.257537334460411e-05, + "loss": 32.7396, + "step": 2764, + "task_loss": 0.3692638874053955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.29989013745958093, + "compression/movement_sparsity/importance_threshold": -0.004903383821813434, + "compression/movement_sparsity/linear_layer_sparsity": 0.17487615797976747, + "compression/movement_sparsity/model_sparsity": 0.16886862397117477, + "compression_loss": 32.259281158447266, + "distillation_loss": 0.4133455753326416, + "epoch": 2.34, + "learning_rate": 4.2570677186061804e-05, + "loss": 32.7346, + "step": 2765, + "task_loss": 0.44254234433174133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.300555852022778, + "compression/movement_sparsity/importance_threshold": -0.004898721333547261, + "compression/movement_sparsity/linear_layer_sparsity": 0.1754136159876187, + "compression/movement_sparsity/model_sparsity": 0.16938761864304214, + "compression_loss": 32.33050537109375, + "distillation_loss": 0.5406585931777954, + "epoch": 2.34, + "learning_rate": 4.256598102751949e-05, + "loss": 32.9984, + "step": 2766, + "task_loss": 0.42210081219673157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.30122114444612613, + "compression/movement_sparsity/importance_threshold": -0.004894061801836647, + "compression/movement_sparsity/linear_layer_sparsity": 0.17619222836173765, + "compression/movement_sparsity/model_sparsity": 0.17013948328681824, + "compression_loss": 32.40168380737305, + "distillation_loss": 0.6512346267700195, + "epoch": 2.34, + "learning_rate": 4.256128486897718e-05, + "loss": 33.1394, + "step": 2767, + "task_loss": 1.1285040378570557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3018860148635112, + "compression/movement_sparsity/importance_threshold": -0.004889405225743892, + "compression/movement_sparsity/linear_layer_sparsity": 0.1769645924720154, + "compression/movement_sparsity/model_sparsity": 0.17088531431383805, + "compression_loss": 32.47282409667969, + "distillation_loss": 0.6796640753746033, + "epoch": 2.34, + "learning_rate": 4.255658871043486e-05, + "loss": 33.1129, + "step": 2768, + "task_loss": 0.12275484204292297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3025504634088183, + "compression/movement_sparsity/importance_threshold": -0.004884751604331298, + "compression/movement_sparsity/linear_layer_sparsity": 0.1777203104442735, + "compression/movement_sparsity/model_sparsity": 0.17161507104888865, + "compression_loss": 32.543914794921875, + "distillation_loss": 0.4159230589866638, + "epoch": 2.34, + "learning_rate": 4.255189255189255e-05, + "loss": 33.3077, + "step": 2769, + "task_loss": 0.3649306893348694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3032144902159324, + "compression/movement_sparsity/importance_threshold": -0.004880100936661171, + "compression/movement_sparsity/linear_layer_sparsity": 0.17826672350201928, + "compression/movement_sparsity/model_sparsity": 0.1721427131371377, + "compression_loss": 32.61498260498047, + "distillation_loss": 0.6360831260681152, + "epoch": 2.34, + "learning_rate": 4.254719639335024e-05, + "loss": 33.3857, + "step": 2770, + "task_loss": 0.8902615904808044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.30387809541873867, + "compression/movement_sparsity/importance_threshold": -0.004875453221795815, + "compression/movement_sparsity/linear_layer_sparsity": 0.1789047141672085, + "compression/movement_sparsity/model_sparsity": 0.1727587868602888, + "compression_loss": 32.68601989746094, + "distillation_loss": 0.5262614488601685, + "epoch": 2.34, + "learning_rate": 4.254250023480793e-05, + "loss": 33.3303, + "step": 2771, + "task_loss": 0.7351558208465576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3045412791511223, + "compression/movement_sparsity/importance_threshold": -0.004870808458797534, + "compression/movement_sparsity/linear_layer_sparsity": 0.17961903142943486, + "compression/movement_sparsity/model_sparsity": 0.17344856512706078, + "compression_loss": 32.75699996948242, + "distillation_loss": 0.6561937928199768, + "epoch": 2.34, + "learning_rate": 4.253780407626562e-05, + "loss": 33.4392, + "step": 2772, + "task_loss": 0.26177799701690674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.30520404154696845, + "compression/movement_sparsity/importance_threshold": -0.004866166646728631, + "compression/movement_sparsity/linear_layer_sparsity": 0.18038525459338017, + "compression/movement_sparsity/model_sparsity": 0.17418846616814637, + "compression_loss": 32.82796096801758, + "distillation_loss": 0.4219472408294678, + "epoch": 2.34, + "learning_rate": 4.25331079177233e-05, + "loss": 33.2983, + "step": 2773, + "task_loss": 0.7018374800682068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3058663827401621, + "compression/movement_sparsity/importance_threshold": -0.004861527784651411, + "compression/movement_sparsity/linear_layer_sparsity": 0.1809776353173622, + "compression/movement_sparsity/model_sparsity": 0.17476049679188338, + "compression_loss": 32.89884948730469, + "distillation_loss": 0.5737485289573669, + "epoch": 2.34, + "learning_rate": 4.2528411759180995e-05, + "loss": 33.5797, + "step": 2774, + "task_loss": 0.25343072414398193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.30652830286458854, + "compression/movement_sparsity/importance_threshold": -0.004856891871628177, + "compression/movement_sparsity/linear_layer_sparsity": 0.18176370029625355, + "compression/movement_sparsity/model_sparsity": 0.1755195580205311, + "compression_loss": 32.969688415527344, + "distillation_loss": 0.4943506121635437, + "epoch": 2.35, + "learning_rate": 4.252371560063868e-05, + "loss": 33.5114, + "step": 2775, + "task_loss": 0.27319368720054626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3071898020541327, + "compression/movement_sparsity/importance_threshold": -0.004852258906721235, + "compression/movement_sparsity/linear_layer_sparsity": 0.1824946636964996, + "compression/movement_sparsity/model_sparsity": 0.17622541057927224, + "compression_loss": 33.04048156738281, + "distillation_loss": 0.7300631999969482, + "epoch": 2.35, + "learning_rate": 4.251901944209637e-05, + "loss": 33.6166, + "step": 2776, + "task_loss": 0.9830614328384399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.30785088044268005, + "compression/movement_sparsity/importance_threshold": -0.004847628888992888, + "compression/movement_sparsity/linear_layer_sparsity": 0.18323243579646573, + "compression/movement_sparsity/model_sparsity": 0.17693783793795206, + "compression_loss": 33.111209869384766, + "distillation_loss": 0.6662949323654175, + "epoch": 2.35, + "learning_rate": 4.2514323283554054e-05, + "loss": 33.6301, + "step": 2777, + "task_loss": 0.518382728099823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3085115381641157, + "compression/movement_sparsity/importance_threshold": -0.004843001817505436, + "compression/movement_sparsity/linear_layer_sparsity": 0.18400340477913008, + "compression/movement_sparsity/model_sparsity": 0.1776823217642839, + "compression_loss": 33.18187713623047, + "distillation_loss": 0.5028715133666992, + "epoch": 2.35, + "learning_rate": 4.250962712501174e-05, + "loss": 33.7198, + "step": 2778, + "task_loss": 0.20833563804626465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.30917177535232454, + "compression/movement_sparsity/importance_threshold": -0.004838377691321189, + "compression/movement_sparsity/linear_layer_sparsity": 0.1848401951671444, + "compression/movement_sparsity/model_sparsity": 0.17849036582820157, + "compression_loss": 33.25251388549805, + "distillation_loss": 0.5396803617477417, + "epoch": 2.35, + "learning_rate": 4.2504930966469433e-05, + "loss": 33.9177, + "step": 2779, + "task_loss": 0.5210409164428711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3098315921411918, + "compression/movement_sparsity/importance_threshold": -0.004833756509502449, + "compression/movement_sparsity/linear_layer_sparsity": 0.18568737150516956, + "compression/movement_sparsity/model_sparsity": 0.1793084390527963, + "compression_loss": 33.32306671142578, + "distillation_loss": 0.449398934841156, + "epoch": 2.35, + "learning_rate": 4.250023480792712e-05, + "loss": 33.9821, + "step": 2780, + "task_loss": 0.5510128140449524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3104909886646028, + "compression/movement_sparsity/importance_threshold": -0.004829138271111519, + "compression/movement_sparsity/linear_layer_sparsity": 0.18646290744403846, + "compression/movement_sparsity/model_sparsity": 0.18005733294633747, + "compression_loss": 33.393585205078125, + "distillation_loss": 0.5781751871109009, + "epoch": 2.35, + "learning_rate": 4.2495538649384806e-05, + "loss": 34.0225, + "step": 2781, + "task_loss": 1.3404947519302368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31114996505644243, + "compression/movement_sparsity/importance_threshold": -0.0048245229752107025, + "compression/movement_sparsity/linear_layer_sparsity": 0.1870965577122082, + "compression/movement_sparsity/model_sparsity": 0.18066921537845937, + "compression_loss": 33.46406555175781, + "distillation_loss": 0.6171655654907227, + "epoch": 2.35, + "learning_rate": 4.249084249084249e-05, + "loss": 34.1901, + "step": 2782, + "task_loss": 1.2153964042663574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.311808521450596, + "compression/movement_sparsity/importance_threshold": -0.0048199106208623044, + "compression/movement_sparsity/linear_layer_sparsity": 0.18786372288539674, + "compression/movement_sparsity/model_sparsity": 0.18141002606787274, + "compression_loss": 33.534523010253906, + "distillation_loss": 0.8364394307136536, + "epoch": 2.35, + "learning_rate": 4.248614633230018e-05, + "loss": 34.2247, + "step": 2783, + "task_loss": 0.8670456409454346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31246665798094864, + "compression/movement_sparsity/importance_threshold": -0.0048153012071286295, + "compression/movement_sparsity/linear_layer_sparsity": 0.18868203081357546, + "compression/movement_sparsity/model_sparsity": 0.1822002226013089, + "compression_loss": 33.60490417480469, + "distillation_loss": 0.6204716563224792, + "epoch": 2.35, + "learning_rate": 4.248145017375787e-05, + "loss": 34.2097, + "step": 2784, + "task_loss": 0.5522196292877197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3131243747813851, + "compression/movement_sparsity/importance_threshold": -0.004810694733071982, + "compression/movement_sparsity/linear_layer_sparsity": 0.18930477046835842, + "compression/movement_sparsity/model_sparsity": 0.18280156923317878, + "compression_loss": 33.675262451171875, + "distillation_loss": 0.6666215658187866, + "epoch": 2.35, + "learning_rate": 4.247675401521555e-05, + "loss": 34.4039, + "step": 2785, + "task_loss": 0.4958130121231079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3137816719857913, + "compression/movement_sparsity/importance_threshold": -0.004806091197754663, + "compression/movement_sparsity/linear_layer_sparsity": 0.19004654908865018, + "compression/movement_sparsity/model_sparsity": 0.18351786547588558, + "compression_loss": 33.74558639526367, + "distillation_loss": 0.807817816734314, + "epoch": 2.35, + "learning_rate": 4.2472057856673245e-05, + "loss": 34.5087, + "step": 2786, + "task_loss": 1.4793214797973633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3144385497280515, + "compression/movement_sparsity/importance_threshold": -0.00480149060023898, + "compression/movement_sparsity/linear_layer_sparsity": 0.19077004795995617, + "compression/movement_sparsity/model_sparsity": 0.18421650993521935, + "compression_loss": 33.81586837768555, + "distillation_loss": 1.3246328830718994, + "epoch": 2.36, + "learning_rate": 4.246736169813093e-05, + "loss": 34.6089, + "step": 2787, + "task_loss": 1.1179027557373047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3150950081420516, + "compression/movement_sparsity/importance_threshold": -0.004796892939587235, + "compression/movement_sparsity/linear_layer_sparsity": 0.19142027282113977, + "compression/movement_sparsity/model_sparsity": 0.18484439757209564, + "compression_loss": 33.886131286621094, + "distillation_loss": 0.7599340081214905, + "epoch": 2.36, + "learning_rate": 4.246266553958862e-05, + "loss": 34.52, + "step": 2788, + "task_loss": 0.9186345338821411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31575104736167636, + "compression/movement_sparsity/importance_threshold": -0.004792298214861731, + "compression/movement_sparsity/linear_layer_sparsity": 0.19197165825678972, + "compression/movement_sparsity/model_sparsity": 0.185376841221771, + "compression_loss": 33.956336975097656, + "distillation_loss": 0.8322329521179199, + "epoch": 2.36, + "learning_rate": 4.245796938104631e-05, + "loss": 34.6703, + "step": 2789, + "task_loss": 0.9146638512611389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31640666752081104, + "compression/movement_sparsity/importance_threshold": -0.004787706425124774, + "compression/movement_sparsity/linear_layer_sparsity": 0.19267244158874938, + "compression/movement_sparsity/model_sparsity": 0.18605355049041616, + "compression_loss": 34.026485443115234, + "distillation_loss": 0.43900004029273987, + "epoch": 2.36, + "learning_rate": 4.245327322250399e-05, + "loss": 34.6707, + "step": 2790, + "task_loss": 0.6692500710487366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31706186875334064, + "compression/movement_sparsity/importance_threshold": -0.004783117569438668, + "compression/movement_sparsity/linear_layer_sparsity": 0.19338239460562104, + "compression/movement_sparsity/model_sparsity": 0.18673911443708732, + "compression_loss": 34.096580505371094, + "distillation_loss": 0.5224969387054443, + "epoch": 2.36, + "learning_rate": 4.244857706396168e-05, + "loss": 34.6197, + "step": 2791, + "task_loss": 0.19285303354263306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3177166511931504, + "compression/movement_sparsity/importance_threshold": -0.0047785316468657146, + "compression/movement_sparsity/linear_layer_sparsity": 0.19422477742825658, + "compression/movement_sparsity/model_sparsity": 0.18755255881829266, + "compression_loss": 34.16664123535156, + "distillation_loss": 0.5225247740745544, + "epoch": 2.36, + "learning_rate": 4.244388090541937e-05, + "loss": 34.7464, + "step": 2792, + "task_loss": 0.36350250244140625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31837101497412545, + "compression/movement_sparsity/importance_threshold": -0.004773948656468221, + "compression/movement_sparsity/linear_layer_sparsity": 0.1948902294515112, + "compression/movement_sparsity/model_sparsity": 0.18819515051737862, + "compression_loss": 34.23663330078125, + "distillation_loss": 0.86546790599823, + "epoch": 2.36, + "learning_rate": 4.2439184746877056e-05, + "loss": 34.9285, + "step": 2793, + "task_loss": 1.1289341449737549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3190249602301509, + "compression/movement_sparsity/importance_threshold": -0.004769368597308489, + "compression/movement_sparsity/linear_layer_sparsity": 0.1957255531669063, + "compression/movement_sparsity/model_sparsity": 0.18900177829339357, + "compression_loss": 34.30657958984375, + "distillation_loss": 0.967835545539856, + "epoch": 2.36, + "learning_rate": 4.243448858833474e-05, + "loss": 35.0436, + "step": 2794, + "task_loss": 1.7970212697982788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.31967848709511204, + "compression/movement_sparsity/importance_threshold": -0.004764791468448822, + "compression/movement_sparsity/linear_layer_sparsity": 0.1963498787359848, + "compression/movement_sparsity/model_sparsity": 0.18960465635852414, + "compression_loss": 34.376487731933594, + "distillation_loss": 0.27886083722114563, + "epoch": 2.36, + "learning_rate": 4.242979242979243e-05, + "loss": 34.9547, + "step": 2795, + "task_loss": 0.08439888060092926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3203315957028935, + "compression/movement_sparsity/importance_threshold": -0.004760217268951528, + "compression/movement_sparsity/linear_layer_sparsity": 0.19716861593419843, + "compression/movement_sparsity/model_sparsity": 0.1903952674152489, + "compression_loss": 34.44633102416992, + "distillation_loss": 1.0911598205566406, + "epoch": 2.36, + "learning_rate": 4.242509627125012e-05, + "loss": 35.1312, + "step": 2796, + "task_loss": 1.0435272455215454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3209842861873812, + "compression/movement_sparsity/importance_threshold": -0.004755645997878907, + "compression/movement_sparsity/linear_layer_sparsity": 0.19778697941945902, + "compression/movement_sparsity/model_sparsity": 0.19099238821248218, + "compression_loss": 34.516136169433594, + "distillation_loss": 0.4747304320335388, + "epoch": 2.36, + "learning_rate": 4.242040011270781e-05, + "loss": 35.0524, + "step": 2797, + "task_loss": 0.6980056762695312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.32163655868245977, + "compression/movement_sparsity/importance_threshold": -0.004751077654293263, + "compression/movement_sparsity/linear_layer_sparsity": 0.19867531798416319, + "compression/movement_sparsity/model_sparsity": 0.1918502096146397, + "compression_loss": 34.58591079711914, + "distillation_loss": 0.8379515409469604, + "epoch": 2.36, + "learning_rate": 4.2415703954165494e-05, + "loss": 35.1267, + "step": 2798, + "task_loss": 0.7269981503486633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3222884133220144, + "compression/movement_sparsity/importance_threshold": -0.004746512237256903, + "compression/movement_sparsity/linear_layer_sparsity": 0.19949038253874496, + "compression/movement_sparsity/model_sparsity": 0.19263727419433974, + "compression_loss": 34.65562438964844, + "distillation_loss": 0.5053322315216064, + "epoch": 2.37, + "learning_rate": 4.241100779562318e-05, + "loss": 35.1413, + "step": 2799, + "task_loss": 1.0725977420806885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.32293985023993044, + "compression/movement_sparsity/importance_threshold": -0.0047419497458321275, + "compression/movement_sparsity/linear_layer_sparsity": 0.2003283414951876, + "compression/movement_sparsity/model_sparsity": 0.19344644668276528, + "compression_loss": 34.72529220581055, + "distillation_loss": 0.9593079090118408, + "epoch": 2.37, + "learning_rate": 4.240631163708087e-05, + "loss": 35.4406, + "step": 2800, + "task_loss": 0.6038237810134888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.32359086957009287, + "compression/movement_sparsity/importance_threshold": -0.004737390179081241, + "compression/movement_sparsity/linear_layer_sparsity": 0.20102935138633235, + "compression/movement_sparsity/model_sparsity": 0.19412337472759056, + "compression_loss": 34.79494094848633, + "distillation_loss": 0.3672041893005371, + "epoch": 2.37, + "learning_rate": 4.240161547853856e-05, + "loss": 35.3609, + "step": 2801, + "task_loss": 1.2621623277664185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3242414714463868, + "compression/movement_sparsity/importance_threshold": -0.00473283353606655, + "compression/movement_sparsity/linear_layer_sparsity": 0.2017316848600847, + "compression/movement_sparsity/model_sparsity": 0.194801580885889, + "compression_loss": 34.864505767822266, + "distillation_loss": 0.6510782837867737, + "epoch": 2.37, + "learning_rate": 4.2396919319996246e-05, + "loss": 35.4972, + "step": 2802, + "task_loss": 0.7390516400337219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3248916560026974, + "compression/movement_sparsity/importance_threshold": -0.004728279815850358, + "compression/movement_sparsity/linear_layer_sparsity": 0.20242369200466434, + "compression/movement_sparsity/model_sparsity": 0.19546981545618938, + "compression_loss": 34.93404006958008, + "distillation_loss": 0.7244553565979004, + "epoch": 2.37, + "learning_rate": 4.239222316145393e-05, + "loss": 35.5431, + "step": 2803, + "task_loss": 0.527935802936554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3255414233729099, + "compression/movement_sparsity/importance_threshold": -0.004723729017494966, + "compression/movement_sparsity/linear_layer_sparsity": 0.20311687964183997, + "compression/movement_sparsity/model_sparsity": 0.19613918996553342, + "compression_loss": 35.00353240966797, + "distillation_loss": 0.5351499319076538, + "epoch": 2.37, + "learning_rate": 4.238752700291162e-05, + "loss": 35.7491, + "step": 2804, + "task_loss": 0.7425433397293091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3261907736909093, + "compression/movement_sparsity/importance_threshold": -0.004719181140062681, + "compression/movement_sparsity/linear_layer_sparsity": 0.20386778025037317, + "compression/movement_sparsity/model_sparsity": 0.19686429482812304, + "compression_loss": 35.073001861572266, + "distillation_loss": 0.8405793905258179, + "epoch": 2.37, + "learning_rate": 4.238283084436931e-05, + "loss": 35.8512, + "step": 2805, + "task_loss": 1.8521146774291992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.32683970709058086, + "compression/movement_sparsity/importance_threshold": -0.004714636182615806, + "compression/movement_sparsity/linear_layer_sparsity": 0.20461590252784723, + "compression/movement_sparsity/model_sparsity": 0.19758671680387252, + "compression_loss": 35.14237976074219, + "distillation_loss": 0.3683815002441406, + "epoch": 2.37, + "learning_rate": 4.2378134685827e-05, + "loss": 35.7502, + "step": 2806, + "task_loss": 0.2676860988140106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.32748822370580943, + "compression/movement_sparsity/importance_threshold": -0.004710094144216646, + "compression/movement_sparsity/linear_layer_sparsity": 0.2052920863019741, + "compression/movement_sparsity/model_sparsity": 0.19823967158517355, + "compression_loss": 35.211734771728516, + "distillation_loss": 0.5893499851226807, + "epoch": 2.37, + "learning_rate": 4.237343852728468e-05, + "loss": 35.8177, + "step": 2807, + "task_loss": 0.6533238291740417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3281363236704806, + "compression/movement_sparsity/importance_threshold": -0.0047055550239275025, + "compression/movement_sparsity/linear_layer_sparsity": 0.20606610787155324, + "compression/movement_sparsity/model_sparsity": 0.1989871031326688, + "compression_loss": 35.281063079833984, + "distillation_loss": 0.8676565289497375, + "epoch": 2.37, + "learning_rate": 4.236874236874237e-05, + "loss": 35.9563, + "step": 2808, + "task_loss": 1.0786687135696411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3287840071184793, + "compression/movement_sparsity/importance_threshold": -0.004701018820810681, + "compression/movement_sparsity/linear_layer_sparsity": 0.20678062784462944, + "compression/movement_sparsity/model_sparsity": 0.19967707714654925, + "compression_loss": 35.350318908691406, + "distillation_loss": 0.7156251668930054, + "epoch": 2.37, + "learning_rate": 4.236404621020006e-05, + "loss": 35.9579, + "step": 2809, + "task_loss": 0.6859104633331299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3294312741836908, + "compression/movement_sparsity/importance_threshold": -0.004696485533928485, + "compression/movement_sparsity/linear_layer_sparsity": 0.2075331143674582, + "compression/movement_sparsity/model_sparsity": 0.20040371344239954, + "compression_loss": 35.419551849365234, + "distillation_loss": 0.758513331413269, + "epoch": 2.38, + "learning_rate": 4.235935005165775e-05, + "loss": 36.21, + "step": 2810, + "task_loss": 0.7702853083610535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.330078125, + "compression/movement_sparsity/importance_threshold": -0.00469195516234322, + "compression/movement_sparsity/linear_layer_sparsity": 0.20822302285853397, + "compression/movement_sparsity/model_sparsity": 0.20106992145440009, + "compression_loss": 35.48873519897461, + "distillation_loss": 0.4104093015193939, + "epoch": 2.38, + "learning_rate": 4.235465389311543e-05, + "loss": 36.2185, + "step": 2811, + "task_loss": 0.8773511648178101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3307245597012922, + "compression/movement_sparsity/importance_threshold": -0.004687427705117188, + "compression/movement_sparsity/linear_layer_sparsity": 0.20895203069528773, + "compression/movement_sparsity/model_sparsity": 0.20177388562927093, + "compression_loss": 35.55790710449219, + "distillation_loss": 1.0391801595687866, + "epoch": 2.38, + "learning_rate": 4.234995773457312e-05, + "loss": 36.1907, + "step": 2812, + "task_loss": 0.6424437761306763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33137057842145246, + "compression/movement_sparsity/importance_threshold": -0.004682903161312694, + "compression/movement_sparsity/linear_layer_sparsity": 0.20967874909185538, + "compression/movement_sparsity/model_sparsity": 0.20247563901326923, + "compression_loss": 35.62705993652344, + "distillation_loss": 0.31216877698898315, + "epoch": 2.38, + "learning_rate": 4.234526157603081e-05, + "loss": 36.0388, + "step": 2813, + "task_loss": 0.7739354372024536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33201618129436594, + "compression/movement_sparsity/importance_threshold": -0.004678381529992042, + "compression/movement_sparsity/linear_layer_sparsity": 0.2103461924511052, + "compression/movement_sparsity/model_sparsity": 0.20312015363983285, + "compression_loss": 35.69611358642578, + "distillation_loss": 0.5482176542282104, + "epoch": 2.38, + "learning_rate": 4.2340565417488496e-05, + "loss": 36.3657, + "step": 2814, + "task_loss": 0.32889318466186523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3326613684539179, + "compression/movement_sparsity/importance_threshold": -0.0046738628102175345, + "compression/movement_sparsity/linear_layer_sparsity": 0.21122898627785866, + "compression/movement_sparsity/model_sparsity": 0.2039726207828459, + "compression_loss": 35.7651252746582, + "distillation_loss": 0.739136815071106, + "epoch": 2.38, + "learning_rate": 4.233586925894618e-05, + "loss": 36.4543, + "step": 2815, + "task_loss": 0.7743887901306152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33330614003399306, + "compression/movement_sparsity/importance_threshold": -0.004669347001051479, + "compression/movement_sparsity/linear_layer_sparsity": 0.21187767292141726, + "compression/movement_sparsity/model_sparsity": 0.2045990230446047, + "compression_loss": 35.8340950012207, + "distillation_loss": 0.7866579294204712, + "epoch": 2.38, + "learning_rate": 4.233117310040387e-05, + "loss": 36.5839, + "step": 2816, + "task_loss": 0.9403125047683716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33395049616847705, + "compression/movement_sparsity/importance_threshold": -0.004664834101556176, + "compression/movement_sparsity/linear_layer_sparsity": 0.21260136257940543, + "compression/movement_sparsity/model_sparsity": 0.20529785173651116, + "compression_loss": 35.90303421020508, + "distillation_loss": 0.8537001609802246, + "epoch": 2.38, + "learning_rate": 4.232647694186156e-05, + "loss": 36.5099, + "step": 2817, + "task_loss": 0.6227242350578308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3345944369912549, + "compression/movement_sparsity/importance_threshold": -0.004660324110793932, + "compression/movement_sparsity/linear_layer_sparsity": 0.21326855553113486, + "compression/movement_sparsity/model_sparsity": 0.20594212455782313, + "compression_loss": 35.971900939941406, + "distillation_loss": 0.3539462089538574, + "epoch": 2.38, + "learning_rate": 4.232178078331925e-05, + "loss": 36.4416, + "step": 2818, + "task_loss": 0.09632567316293716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33523796263621175, + "compression/movement_sparsity/importance_threshold": -0.0046558170278270476, + "compression/movement_sparsity/linear_layer_sparsity": 0.21399220941662012, + "compression/movement_sparsity/model_sparsity": 0.2066409187061222, + "compression_loss": 36.04072952270508, + "distillation_loss": 0.3503519296646118, + "epoch": 2.38, + "learning_rate": 4.2317084624776934e-05, + "loss": 36.5733, + "step": 2819, + "task_loss": 0.28203102946281433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3358810732372326, + "compression/movement_sparsity/importance_threshold": -0.004651312851717829, + "compression/movement_sparsity/linear_layer_sparsity": 0.21466055901261025, + "compression/movement_sparsity/model_sparsity": 0.20728630843740622, + "compression_loss": 36.109493255615234, + "distillation_loss": 0.4840245842933655, + "epoch": 2.38, + "learning_rate": 4.231238846623462e-05, + "loss": 36.8159, + "step": 2820, + "task_loss": 1.1661646366119385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3365237689282028, + "compression/movement_sparsity/importance_threshold": -0.00464681158152858, + "compression/movement_sparsity/linear_layer_sparsity": 0.21542376536214364, + "compression/movement_sparsity/model_sparsity": 0.2080232963009358, + "compression_loss": 36.17823791503906, + "distillation_loss": 0.5955571532249451, + "epoch": 2.38, + "learning_rate": 4.230769230769231e-05, + "loss": 36.7818, + "step": 2821, + "task_loss": 0.8083208799362183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33716604984300724, + "compression/movement_sparsity/importance_threshold": -0.004642313216321605, + "compression/movement_sparsity/linear_layer_sparsity": 0.2162672452082017, + "compression/movement_sparsity/model_sparsity": 0.20883780001943422, + "compression_loss": 36.246891021728516, + "distillation_loss": 1.1211678981781006, + "epoch": 2.39, + "learning_rate": 4.230299614915e-05, + "loss": 37.1341, + "step": 2822, + "task_loss": 0.7611430287361145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3378079161155312, + "compression/movement_sparsity/importance_threshold": -0.004637817755159208, + "compression/movement_sparsity/linear_layer_sparsity": 0.21707049291265634, + "compression/movement_sparsity/model_sparsity": 0.20961345369416187, + "compression_loss": 36.31551742553711, + "distillation_loss": 0.6499782800674438, + "epoch": 2.39, + "learning_rate": 4.2298299990607687e-05, + "loss": 36.9344, + "step": 2823, + "task_loss": 1.131666898727417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33844936787965985, + "compression/movement_sparsity/importance_threshold": -0.0046333251971036925, + "compression/movement_sparsity/linear_layer_sparsity": 0.21771463644834566, + "compression/movement_sparsity/model_sparsity": 0.21023546891778297, + "compression_loss": 36.38413619995117, + "distillation_loss": 1.1920796632766724, + "epoch": 2.39, + "learning_rate": 4.229360383206537e-05, + "loss": 37.262, + "step": 2824, + "task_loss": 1.2239148616790771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.33909040526927825, + "compression/movement_sparsity/importance_threshold": -0.004628835541217362, + "compression/movement_sparsity/linear_layer_sparsity": 0.2183505403841986, + "compression/movement_sparsity/model_sparsity": 0.21084952759717004, + "compression_loss": 36.45266342163086, + "distillation_loss": 0.4528842270374298, + "epoch": 2.39, + "learning_rate": 4.228890767352306e-05, + "loss": 36.9567, + "step": 2825, + "task_loss": 0.4460195302963257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3397310284182715, + "compression/movement_sparsity/importance_threshold": -0.004624348786562522, + "compression/movement_sparsity/linear_layer_sparsity": 0.2191508905159177, + "compression/movement_sparsity/model_sparsity": 0.21162238323969962, + "compression_loss": 36.52116012573242, + "distillation_loss": 0.3998104929924011, + "epoch": 2.39, + "learning_rate": 4.2284211514980746e-05, + "loss": 37.053, + "step": 2826, + "task_loss": 0.15969060361385345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34037123746052467, + "compression/movement_sparsity/importance_threshold": -0.004619864932201477, + "compression/movement_sparsity/linear_layer_sparsity": 0.21985911452848217, + "compression/movement_sparsity/model_sparsity": 0.2123062775786806, + "compression_loss": 36.589595794677734, + "distillation_loss": 0.5855236053466797, + "epoch": 2.39, + "learning_rate": 4.227951535643844e-05, + "loss": 37.2528, + "step": 2827, + "task_loss": 0.25490692257881165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34101103252992326, + "compression/movement_sparsity/importance_threshold": -0.0046153839771965275, + "compression/movement_sparsity/linear_layer_sparsity": 0.2205858210008822, + "compression/movement_sparsity/model_sparsity": 0.21300801944814307, + "compression_loss": 36.65797424316406, + "distillation_loss": 0.6798850893974304, + "epoch": 2.39, + "learning_rate": 4.227481919789612e-05, + "loss": 37.2153, + "step": 2828, + "task_loss": 0.4578976035118103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3416504137603522, + "compression/movement_sparsity/importance_threshold": -0.004610905920609979, + "compression/movement_sparsity/linear_layer_sparsity": 0.22136138078808637, + "compression/movement_sparsity/model_sparsity": 0.2137569363707558, + "compression_loss": 36.72627258300781, + "distillation_loss": 0.8175458908081055, + "epoch": 2.39, + "learning_rate": 4.227012303935381e-05, + "loss": 37.5156, + "step": 2829, + "task_loss": 1.8037407398223877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34228938128569664, + "compression/movement_sparsity/importance_threshold": -0.004606430761504136, + "compression/movement_sparsity/linear_layer_sparsity": 0.22223964343113822, + "compression/movement_sparsity/model_sparsity": 0.2146050279901669, + "compression_loss": 36.794559478759766, + "distillation_loss": 0.935619592666626, + "epoch": 2.39, + "learning_rate": 4.22654268808115e-05, + "loss": 37.5858, + "step": 2830, + "task_loss": 0.9907481670379639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3429279352398418, + "compression/movement_sparsity/importance_threshold": -0.004601958498941303, + "compression/movement_sparsity/linear_layer_sparsity": 0.22299996413210374, + "compression/movement_sparsity/model_sparsity": 0.21533922933603422, + "compression_loss": 36.8628044128418, + "distillation_loss": 0.6764949560165405, + "epoch": 2.39, + "learning_rate": 4.2260730722269184e-05, + "loss": 37.4213, + "step": 2831, + "task_loss": 0.14373371005058289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34356607575667253, + "compression/movement_sparsity/importance_threshold": -0.004597489131983783, + "compression/movement_sparsity/linear_layer_sparsity": 0.22376028483306928, + "compression/movement_sparsity/model_sparsity": 0.21607343068190152, + "compression_loss": 36.9310302734375, + "distillation_loss": 0.578451931476593, + "epoch": 2.39, + "learning_rate": 4.225603456372688e-05, + "loss": 37.7139, + "step": 2832, + "task_loss": 0.6982750296592712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34420380297007425, + "compression/movement_sparsity/importance_threshold": -0.004593022659693881, + "compression/movement_sparsity/linear_layer_sparsity": 0.2245329708958732, + "compression/movement_sparsity/model_sparsity": 0.21681957260138776, + "compression_loss": 36.99917221069336, + "distillation_loss": 0.44185081124305725, + "epoch": 2.39, + "learning_rate": 4.225133840518456e-05, + "loss": 37.578, + "step": 2833, + "task_loss": 0.9641563296318054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34484111701393205, + "compression/movement_sparsity/importance_threshold": -0.004588559081133899, + "compression/movement_sparsity/linear_layer_sparsity": 0.22528269101181045, + "compression/movement_sparsity/model_sparsity": 0.21754353752493372, + "compression_loss": 37.06732940673828, + "distillation_loss": 0.5886393189430237, + "epoch": 2.4, + "learning_rate": 4.224664224664225e-05, + "loss": 37.675, + "step": 2834, + "task_loss": 0.8013940453529358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3454780180221311, + "compression/movement_sparsity/importance_threshold": -0.0045840983953661435, + "compression/movement_sparsity/linear_layer_sparsity": 0.22605546054378783, + "compression/movement_sparsity/model_sparsity": 0.2182897600461705, + "compression_loss": 37.13538360595703, + "distillation_loss": 0.4786885976791382, + "epoch": 2.4, + "learning_rate": 4.2241946088099936e-05, + "loss": 37.6376, + "step": 2835, + "task_loss": 0.45392701029777527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3461145061285561, + "compression/movement_sparsity/importance_threshold": -0.004579640601452918, + "compression/movement_sparsity/linear_layer_sparsity": 0.22683406099373915, + "compression/movement_sparsity/model_sparsity": 0.21904161317541085, + "compression_loss": 37.20341491699219, + "distillation_loss": 0.45206597447395325, + "epoch": 2.4, + "learning_rate": 4.223724992955763e-05, + "loss": 37.7405, + "step": 2836, + "task_loss": 1.5867104530334473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34675058146709314, + "compression/movement_sparsity/importance_threshold": -0.004575185698456524, + "compression/movement_sparsity/linear_layer_sparsity": 0.22763996778757659, + "compression/movement_sparsity/model_sparsity": 0.2198198345916207, + "compression_loss": 37.27141571044922, + "distillation_loss": 0.5896062850952148, + "epoch": 2.4, + "learning_rate": 4.223255377101531e-05, + "loss": 37.8077, + "step": 2837, + "task_loss": 0.717898428440094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34738624417162633, + "compression/movement_sparsity/importance_threshold": -0.004570733685439269, + "compression/movement_sparsity/linear_layer_sparsity": 0.22829324523567499, + "compression/movement_sparsity/model_sparsity": 0.2204506699496604, + "compression_loss": 37.33934783935547, + "distillation_loss": 0.5407304167747498, + "epoch": 2.4, + "learning_rate": 4.2227857612472995e-05, + "loss": 38.1771, + "step": 2838, + "task_loss": 0.540482759475708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34802149437604146, + "compression/movement_sparsity/importance_threshold": -0.004566284561463454, + "compression/movement_sparsity/linear_layer_sparsity": 0.22905790633365997, + "compression/movement_sparsity/model_sparsity": 0.22118906258655693, + "compression_loss": 37.40721893310547, + "distillation_loss": 0.5764235854148865, + "epoch": 2.4, + "learning_rate": 4.222316145393069e-05, + "loss": 38.0594, + "step": 2839, + "task_loss": 0.8560652732849121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34865633221422343, + "compression/movement_sparsity/importance_threshold": -0.004561838325591386, + "compression/movement_sparsity/linear_layer_sparsity": 0.2299116528880525, + "compression/movement_sparsity/model_sparsity": 0.22201348032037446, + "compression_loss": 37.47505187988281, + "distillation_loss": 0.9417319893836975, + "epoch": 2.4, + "learning_rate": 4.2218465295388375e-05, + "loss": 38.2177, + "step": 2840, + "task_loss": 1.4386892318725586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3492907578200576, + "compression/movement_sparsity/importance_threshold": -0.004557394976885365, + "compression/movement_sparsity/linear_layer_sparsity": 0.23059410877435582, + "compression/movement_sparsity/model_sparsity": 0.22267249174750342, + "compression_loss": 37.54283142089844, + "distillation_loss": 0.6872855424880981, + "epoch": 2.4, + "learning_rate": 4.221376913684606e-05, + "loss": 38.3467, + "step": 2841, + "task_loss": 0.5796800851821899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.34992477132742883, + "compression/movement_sparsity/importance_threshold": -0.004552954514407698, + "compression/movement_sparsity/linear_layer_sparsity": 0.23143056528567635, + "compression/movement_sparsity/model_sparsity": 0.22348021340441887, + "compression_loss": 37.610572814941406, + "distillation_loss": 0.789714515209198, + "epoch": 2.4, + "learning_rate": 4.220907297830375e-05, + "loss": 38.3226, + "step": 2842, + "task_loss": 0.7521236538887024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3505583728702224, + "compression/movement_sparsity/importance_threshold": -0.004548516937220688, + "compression/movement_sparsity/linear_layer_sparsity": 0.23222729047043417, + "compression/movement_sparsity/model_sparsity": 0.2242495686280669, + "compression_loss": 37.67828369140625, + "distillation_loss": 0.5410398840904236, + "epoch": 2.4, + "learning_rate": 4.220437681976144e-05, + "loss": 38.4518, + "step": 2843, + "task_loss": 0.43216603994369507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3511915625823233, + "compression/movement_sparsity/importance_threshold": -0.00454408224438664, + "compression/movement_sparsity/linear_layer_sparsity": 0.2330368222112329, + "compression/movement_sparsity/model_sparsity": 0.22503129046315826, + "compression_loss": 37.74591827392578, + "distillation_loss": 0.5213282704353333, + "epoch": 2.4, + "learning_rate": 4.219968066121913e-05, + "loss": 38.5246, + "step": 2844, + "task_loss": 0.3962607979774475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.351824340597617, + "compression/movement_sparsity/importance_threshold": -0.004539650434967856, + "compression/movement_sparsity/linear_layer_sparsity": 0.23383051865741122, + "compression/movement_sparsity/model_sparsity": 0.22579772099471448, + "compression_loss": 37.81352996826172, + "distillation_loss": 0.7692157030105591, + "epoch": 2.4, + "learning_rate": 4.219498450267681e-05, + "loss": 38.4898, + "step": 2845, + "task_loss": 0.5137061476707458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3524567070499883, + "compression/movement_sparsity/importance_threshold": -0.0045352215080266415, + "compression/movement_sparsity/linear_layer_sparsity": 0.23455111995598169, + "compression/movement_sparsity/model_sparsity": 0.22649356742185017, + "compression_loss": 37.88108825683594, + "distillation_loss": 0.41877615451812744, + "epoch": 2.41, + "learning_rate": 4.21902883441345e-05, + "loss": 38.4618, + "step": 2846, + "task_loss": 0.28510376811027527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.35308866207332235, + "compression/movement_sparsity/importance_threshold": -0.004530795462625302, + "compression/movement_sparsity/linear_layer_sparsity": 0.23518345856571152, + "compression/movement_sparsity/model_sparsity": 0.22710418325503468, + "compression_loss": 37.94862365722656, + "distillation_loss": 0.45660996437072754, + "epoch": 2.41, + "learning_rate": 4.2185592185592186e-05, + "loss": 38.5502, + "step": 2847, + "task_loss": 0.3508904278278351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.35372020580150454, + "compression/movement_sparsity/importance_threshold": -0.004526372297826138, + "compression/movement_sparsity/linear_layer_sparsity": 0.23585833068139844, + "compression/movement_sparsity/model_sparsity": 0.2277558714373983, + "compression_loss": 38.016109466552734, + "distillation_loss": 0.8252172470092773, + "epoch": 2.41, + "learning_rate": 4.218089602704988e-05, + "loss": 38.8264, + "step": 2848, + "task_loss": 0.38833123445510864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3543513383684199, + "compression/movement_sparsity/importance_threshold": -0.004521952012691455, + "compression/movement_sparsity/linear_layer_sparsity": 0.23666479791111478, + "compression/movement_sparsity/model_sparsity": 0.22853463403679047, + "compression_loss": 38.083526611328125, + "distillation_loss": 0.6967899799346924, + "epoch": 2.41, + "learning_rate": 4.2176199868507565e-05, + "loss": 38.8926, + "step": 2849, + "task_loss": 0.9952340126037598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3549820599079536, + "compression/movement_sparsity/importance_threshold": -0.0045175346062835565, + "compression/movement_sparsity/linear_layer_sparsity": 0.23741293211275646, + "compression/movement_sparsity/model_sparsity": 0.22925706752707578, + "compression_loss": 38.15088653564453, + "distillation_loss": 0.9019254446029663, + "epoch": 2.41, + "learning_rate": 4.217150370996525e-05, + "loss": 38.9024, + "step": 2850, + "task_loss": 0.8501352667808533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.35561237055399075, + "compression/movement_sparsity/importance_threshold": -0.004513120077664747, + "compression/movement_sparsity/linear_layer_sparsity": 0.23825929760738238, + "compression/movement_sparsity/model_sparsity": 0.23007435776323648, + "compression_loss": 38.21818923950195, + "distillation_loss": 0.9143953919410706, + "epoch": 2.41, + "learning_rate": 4.216680755142294e-05, + "loss": 38.8971, + "step": 2851, + "task_loss": 0.38667431473731995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.35624227044041656, + "compression/movement_sparsity/importance_threshold": -0.00450870842589733, + "compression/movement_sparsity/linear_layer_sparsity": 0.23906649221132448, + "compression/movement_sparsity/model_sparsity": 0.23085382274931213, + "compression_loss": 38.285499572753906, + "distillation_loss": 0.9725979566574097, + "epoch": 2.41, + "learning_rate": 4.2162111392880624e-05, + "loss": 39.0366, + "step": 2852, + "task_loss": 0.5380443930625916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.35687175970111595, + "compression/movement_sparsity/importance_threshold": -0.004504299650043611, + "compression/movement_sparsity/linear_layer_sparsity": 0.23979530926139608, + "compression/movement_sparsity/model_sparsity": 0.23155760269161024, + "compression_loss": 38.352752685546875, + "distillation_loss": 0.7909529805183411, + "epoch": 2.41, + "learning_rate": 4.215741523433832e-05, + "loss": 39.1394, + "step": 2853, + "task_loss": 0.7013994455337524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.35750083846997427, + "compression/movement_sparsity/importance_threshold": -0.004499893749165892, + "compression/movement_sparsity/linear_layer_sparsity": 0.24057822626003156, + "compression/movement_sparsity/model_sparsity": 0.2323136240828082, + "compression_loss": 38.41997528076172, + "distillation_loss": 0.6894046068191528, + "epoch": 2.41, + "learning_rate": 4.2152719075796e-05, + "loss": 39.0867, + "step": 2854, + "task_loss": 0.43048420548439026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.35812950688087664, + "compression/movement_sparsity/importance_threshold": -0.0044954907223264794, + "compression/movement_sparsity/linear_layer_sparsity": 0.24128690339096617, + "compression/movement_sparsity/model_sparsity": 0.23299795597414935, + "compression_loss": 38.487152099609375, + "distillation_loss": 0.8894556760787964, + "epoch": 2.41, + "learning_rate": 4.214802291725369e-05, + "loss": 39.3172, + "step": 2855, + "task_loss": 1.365706205368042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.35875776506770773, + "compression/movement_sparsity/importance_threshold": -0.004491090568587677, + "compression/movement_sparsity/linear_layer_sparsity": 0.2420775830227326, + "compression/movement_sparsity/model_sparsity": 0.23376147332814956, + "compression_loss": 38.554290771484375, + "distillation_loss": 0.36945828795433044, + "epoch": 2.41, + "learning_rate": 4.2143326758711376e-05, + "loss": 39.319, + "step": 2856, + "task_loss": 0.11058413982391357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3593856131643538, + "compression/movement_sparsity/importance_threshold": -0.004486693287011782, + "compression/movement_sparsity/linear_layer_sparsity": 0.24279491709937087, + "compression/movement_sparsity/model_sparsity": 0.23445416477247755, + "compression_loss": 38.62138748168945, + "distillation_loss": 1.327958106994629, + "epoch": 2.41, + "learning_rate": 4.213863060016906e-05, + "loss": 39.6254, + "step": 2857, + "task_loss": 1.2029945850372314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36001305130469874, + "compression/movement_sparsity/importance_threshold": -0.004482298876661109, + "compression/movement_sparsity/linear_layer_sparsity": 0.24352433035782423, + "compression/movement_sparsity/model_sparsity": 0.2351585204415654, + "compression_loss": 38.688438415527344, + "distillation_loss": 0.4099147915840149, + "epoch": 2.42, + "learning_rate": 4.213393444162675e-05, + "loss": 39.3491, + "step": 2858, + "task_loss": 0.821457028388977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36064007962262845, + "compression/movement_sparsity/importance_threshold": -0.004477907336597956, + "compression/movement_sparsity/linear_layer_sparsity": 0.24410912731118986, + "compression/movement_sparsity/model_sparsity": 0.23572322782053706, + "compression_loss": 38.75544738769531, + "distillation_loss": 0.3898249864578247, + "epoch": 2.42, + "learning_rate": 4.2129238283084435e-05, + "loss": 39.5784, + "step": 2859, + "task_loss": 0.44446590542793274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36126669825202784, + "compression/movement_sparsity/importance_threshold": -0.004473518665884626, + "compression/movement_sparsity/linear_layer_sparsity": 0.2450224350829235, + "compression/movement_sparsity/model_sparsity": 0.23660516066064832, + "compression_loss": 38.822410583496094, + "distillation_loss": 0.4820502996444702, + "epoch": 2.42, + "learning_rate": 4.212454212454213e-05, + "loss": 39.4989, + "step": 2860, + "task_loss": 0.7827270030975342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36189290732678214, + "compression/movement_sparsity/importance_threshold": -0.004469132863583426, + "compression/movement_sparsity/linear_layer_sparsity": 0.24563429989682256, + "compression/movement_sparsity/model_sparsity": 0.2371960060358736, + "compression_loss": 38.88930130004883, + "distillation_loss": 0.8735820055007935, + "epoch": 2.42, + "learning_rate": 4.2119845965999815e-05, + "loss": 39.6787, + "step": 2861, + "task_loss": 1.5371885299682617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3625187069807764, + "compression/movement_sparsity/importance_threshold": -0.004464749928756659, + "compression/movement_sparsity/linear_layer_sparsity": 0.246373240565217, + "compression/movement_sparsity/model_sparsity": 0.2379095618190613, + "compression_loss": 38.956138610839844, + "distillation_loss": 1.015181541442871, + "epoch": 2.42, + "learning_rate": 4.21151498074575e-05, + "loss": 39.8934, + "step": 2862, + "task_loss": 1.1540307998657227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36314409734789577, + "compression/movement_sparsity/importance_threshold": -0.004460369860466627, + "compression/movement_sparsity/linear_layer_sparsity": 0.2471150668821793, + "compression/movement_sparsity/model_sparsity": 0.23862590411991128, + "compression_loss": 39.02297592163086, + "distillation_loss": 0.7358248829841614, + "epoch": 2.42, + "learning_rate": 4.211045364891519e-05, + "loss": 39.845, + "step": 2863, + "task_loss": 0.4448283314704895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36376907856202545, + "compression/movement_sparsity/importance_threshold": -0.004455992657775637, + "compression/movement_sparsity/linear_layer_sparsity": 0.24789999906514526, + "compression/movement_sparsity/model_sparsity": 0.23938387146765852, + "compression_loss": 39.089725494384766, + "distillation_loss": 0.5005629062652588, + "epoch": 2.42, + "learning_rate": 4.2105757490372874e-05, + "loss": 39.6118, + "step": 2864, + "task_loss": 0.2224845290184021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36439365075705055, + "compression/movement_sparsity/importance_threshold": -0.004451618319745991, + "compression/movement_sparsity/linear_layer_sparsity": 0.24883126463333852, + "compression/movement_sparsity/model_sparsity": 0.24028314519867636, + "compression_loss": 39.15641403198242, + "distillation_loss": 0.39794421195983887, + "epoch": 2.42, + "learning_rate": 4.210106133183057e-05, + "loss": 39.7211, + "step": 2865, + "task_loss": 0.3817918002605438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36501781406685607, + "compression/movement_sparsity/importance_threshold": -0.004447246845439996, + "compression/movement_sparsity/linear_layer_sparsity": 0.2496526489967673, + "compression/movement_sparsity/model_sparsity": 0.2410763124823475, + "compression_loss": 39.22306442260742, + "distillation_loss": 0.7675390243530273, + "epoch": 2.42, + "learning_rate": 4.209636517328825e-05, + "loss": 39.9514, + "step": 2866, + "task_loss": 1.4659264087677002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36564156862532715, + "compression/movement_sparsity/importance_threshold": -0.004442878233919953, + "compression/movement_sparsity/linear_layer_sparsity": 0.25043907170068774, + "compression/movement_sparsity/model_sparsity": 0.24183571914706906, + "compression_loss": 39.2896728515625, + "distillation_loss": 0.4020206034183502, + "epoch": 2.42, + "learning_rate": 4.209166901474594e-05, + "loss": 39.8924, + "step": 2867, + "task_loss": 0.13939334452152252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36626491456634935, + "compression/movement_sparsity/importance_threshold": -0.0044385124842481645, + "compression/movement_sparsity/linear_layer_sparsity": 0.25119482544544874, + "compression/movement_sparsity/model_sparsity": 0.24256551042572705, + "compression_loss": 39.35624694824219, + "distillation_loss": 0.7531265616416931, + "epoch": 2.42, + "learning_rate": 4.2086972856203626e-05, + "loss": 40.1112, + "step": 2868, + "task_loss": 0.6136288046836853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36688785202380747, + "compression/movement_sparsity/importance_threshold": -0.004434149595486938, + "compression/movement_sparsity/linear_layer_sparsity": 0.25192741053249323, + "compression/movement_sparsity/model_sparsity": 0.24327292896133626, + "compression_loss": 39.422752380371094, + "distillation_loss": 0.5470821857452393, + "epoch": 2.42, + "learning_rate": 4.208227669766132e-05, + "loss": 40.2009, + "step": 2869, + "task_loss": 0.7233137488365173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3675103811315866, + "compression/movement_sparsity/importance_threshold": -0.004429789566698576, + "compression/movement_sparsity/linear_layer_sparsity": 0.2528071636964996, + "compression/movement_sparsity/model_sparsity": 0.2441224598977217, + "compression_loss": 39.48924255371094, + "distillation_loss": 0.580055832862854, + "epoch": 2.43, + "learning_rate": 4.2077580539119006e-05, + "loss": 40.1623, + "step": 2870, + "task_loss": 0.3591819405555725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3681325020235722, + "compression/movement_sparsity/importance_threshold": -0.004425432396945382, + "compression/movement_sparsity/linear_layer_sparsity": 0.25380409565586354, + "compression/movement_sparsity/model_sparsity": 0.24508514417736005, + "compression_loss": 39.55565643310547, + "distillation_loss": 0.6337827444076538, + "epoch": 2.43, + "learning_rate": 4.2072884380576685e-05, + "loss": 40.4344, + "step": 2871, + "task_loss": 0.5271090865135193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.368754214833649, + "compression/movement_sparsity/importance_threshold": -0.004421078085289661, + "compression/movement_sparsity/linear_layer_sparsity": 0.2546627072706515, + "compression/movement_sparsity/model_sparsity": 0.24591425984178175, + "compression_loss": 39.62202072143555, + "distillation_loss": 1.0839588642120361, + "epoch": 2.43, + "learning_rate": 4.206818822203438e-05, + "loss": 40.3394, + "step": 2872, + "task_loss": 0.9516077637672424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.36937551969570237, + "compression/movement_sparsity/importance_threshold": -0.004416726630793717, + "compression/movement_sparsity/linear_layer_sparsity": 0.25527959215912527, + "compression/movement_sparsity/model_sparsity": 0.24650995283657653, + "compression_loss": 39.6883544921875, + "distillation_loss": 0.46678072214126587, + "epoch": 2.43, + "learning_rate": 4.2063492063492065e-05, + "loss": 40.1666, + "step": 2873, + "task_loss": 0.27631038427352905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3699964167436175, + "compression/movement_sparsity/importance_threshold": -0.004412378032519852, + "compression/movement_sparsity/linear_layer_sparsity": 0.2560888019473978, + "compression/movement_sparsity/model_sparsity": 0.24729136377920144, + "compression_loss": 39.75463104248047, + "distillation_loss": 0.5165824890136719, + "epoch": 2.43, + "learning_rate": 4.205879590494976e-05, + "loss": 40.4784, + "step": 2874, + "task_loss": 0.4173288345336914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3706169061112794, + "compression/movement_sparsity/importance_threshold": -0.004408032289530373, + "compression/movement_sparsity/linear_layer_sparsity": 0.25687739484982797, + "compression/movement_sparsity/model_sparsity": 0.2480528660894376, + "compression_loss": 39.82091522216797, + "distillation_loss": 0.9371733665466309, + "epoch": 2.43, + "learning_rate": 4.205409974640744e-05, + "loss": 40.69, + "step": 2875, + "task_loss": 1.7548267841339111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.371236987932573, + "compression/movement_sparsity/importance_threshold": -0.004403689400887583, + "compression/movement_sparsity/linear_layer_sparsity": 0.25749401740661376, + "compression/movement_sparsity/model_sparsity": 0.2486483057644449, + "compression_loss": 39.88713836669922, + "distillation_loss": 0.9543690085411072, + "epoch": 2.43, + "learning_rate": 4.204940358786513e-05, + "loss": 40.6556, + "step": 2876, + "task_loss": 1.0020779371261597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.37185666234138415, + "compression/movement_sparsity/importance_threshold": -0.004399349365653784, + "compression/movement_sparsity/linear_layer_sparsity": 0.2582421158357525, + "compression/movement_sparsity/model_sparsity": 0.2493707047111228, + "compression_loss": 39.95332336425781, + "distillation_loss": 0.8314875960350037, + "epoch": 2.43, + "learning_rate": 4.204470742932282e-05, + "loss": 40.5892, + "step": 2877, + "task_loss": 0.8611935973167419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.37247592947159713, + "compression/movement_sparsity/importance_threshold": -0.004395012182891282, + "compression/movement_sparsity/linear_layer_sparsity": 0.2591518105846925, + "compression/movement_sparsity/model_sparsity": 0.2502491486468883, + "compression_loss": 40.019466400146484, + "distillation_loss": 0.7959657311439514, + "epoch": 2.43, + "learning_rate": 4.20400112707805e-05, + "loss": 40.7932, + "step": 2878, + "task_loss": 0.7210441827774048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3730947894570976, + "compression/movement_sparsity/importance_threshold": -0.00439067785166238, + "compression/movement_sparsity/linear_layer_sparsity": 0.2598087606764227, + "compression/movement_sparsity/model_sparsity": 0.2508835304819528, + "compression_loss": 40.085548400878906, + "distillation_loss": 0.7294604778289795, + "epoch": 2.43, + "learning_rate": 4.2035315112238196e-05, + "loss": 40.7666, + "step": 2879, + "task_loss": 0.4791008532047272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.37371324243177073, + "compression/movement_sparsity/importance_threshold": -0.004386346371029384, + "compression/movement_sparsity/linear_layer_sparsity": 0.2606237894585016, + "compression/movement_sparsity/model_sparsity": 0.2516705605180454, + "compression_loss": 40.1515998840332, + "distillation_loss": 1.1352918148040771, + "epoch": 2.43, + "learning_rate": 4.2030618953695876e-05, + "loss": 40.9431, + "step": 2880, + "task_loss": 0.9499437808990479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3743312885295015, + "compression/movement_sparsity/importance_threshold": -0.004382017740054594, + "compression/movement_sparsity/linear_layer_sparsity": 0.26126811185670545, + "compression/movement_sparsity/model_sparsity": 0.25229274845970345, + "compression_loss": 40.217594146728516, + "distillation_loss": 0.9484983682632446, + "epoch": 2.44, + "learning_rate": 4.202592279515357e-05, + "loss": 40.8706, + "step": 2881, + "task_loss": 0.4032846689224243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.374948927884175, + "compression/movement_sparsity/importance_threshold": -0.004377691957800318, + "compression/movement_sparsity/linear_layer_sparsity": 0.2619765743526226, + "compression/movement_sparsity/model_sparsity": 0.25297687308940026, + "compression_loss": 40.28354263305664, + "distillation_loss": 1.3353146314620972, + "epoch": 2.44, + "learning_rate": 4.2021226636611255e-05, + "loss": 41.0718, + "step": 2882, + "task_loss": 0.9825502634048462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3755661606296764, + "compression/movement_sparsity/importance_threshold": -0.004373369023328857, + "compression/movement_sparsity/linear_layer_sparsity": 0.26255964230168105, + "compression/movement_sparsity/model_sparsity": 0.2535399108606817, + "compression_loss": 40.349483489990234, + "distillation_loss": 0.6703431606292725, + "epoch": 2.44, + "learning_rate": 4.201653047806894e-05, + "loss": 41.0673, + "step": 2883, + "task_loss": 0.49749696254730225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.37618298689989105, + "compression/movement_sparsity/importance_threshold": -0.004369048935702516, + "compression/movement_sparsity/linear_layer_sparsity": 0.2634891788655671, + "compression/movement_sparsity/model_sparsity": 0.25443751498400935, + "compression_loss": 40.41535568237305, + "distillation_loss": 0.7440531253814697, + "epoch": 2.44, + "learning_rate": 4.201183431952663e-05, + "loss": 41.077, + "step": 2884, + "task_loss": 0.36776742339134216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.37679940682870383, + "compression/movement_sparsity/importance_threshold": -0.0043647316939836, + "compression/movement_sparsity/linear_layer_sparsity": 0.26421934334658154, + "compression/movement_sparsity/model_sparsity": 0.25514259606885226, + "compression_loss": 40.48122024536133, + "distillation_loss": 0.9638993740081787, + "epoch": 2.44, + "learning_rate": 4.2007138160984314e-05, + "loss": 41.3331, + "step": 2885, + "task_loss": 1.150522232055664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3774154205499999, + "compression/movement_sparsity/importance_threshold": -0.004360417297234413, + "compression/movement_sparsity/linear_layer_sparsity": 0.26499822997655614, + "compression/movement_sparsity/model_sparsity": 0.2558947255469517, + "compression_loss": 40.547027587890625, + "distillation_loss": 0.731335461139679, + "epoch": 2.44, + "learning_rate": 4.200244200244201e-05, + "loss": 41.4364, + "step": 2886, + "task_loss": 1.0942035913467407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3780310281976643, + "compression/movement_sparsity/importance_threshold": -0.004356105744517259, + "compression/movement_sparsity/linear_layer_sparsity": 0.2657790363975201, + "compression/movement_sparsity/model_sparsity": 0.256648708865314, + "compression_loss": 40.61281204223633, + "distillation_loss": 0.544540524482727, + "epoch": 2.44, + "learning_rate": 4.1997745843899694e-05, + "loss": 41.3138, + "step": 2887, + "task_loss": 0.4676317572593689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3786462299055826, + "compression/movement_sparsity/importance_threshold": -0.004351797034894441, + "compression/movement_sparsity/linear_layer_sparsity": 0.26668183897756653, + "compression/movement_sparsity/model_sparsity": 0.25752049739939026, + "compression_loss": 40.678524017333984, + "distillation_loss": 0.623945951461792, + "epoch": 2.44, + "learning_rate": 4.199304968535738e-05, + "loss": 41.5715, + "step": 2888, + "task_loss": 0.7430536150932312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.37926102580763965, + "compression/movement_sparsity/importance_threshold": -0.0043474911674282616, + "compression/movement_sparsity/linear_layer_sparsity": 0.26752262396173887, + "compression/movement_sparsity/model_sparsity": 0.2583323988327991, + "compression_loss": 40.74421691894531, + "distillation_loss": 0.598175048828125, + "epoch": 2.44, + "learning_rate": 4.1988353526815066e-05, + "loss": 41.4824, + "step": 2889, + "task_loss": 0.9430882334709167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3798754160377207, + "compression/movement_sparsity/importance_threshold": -0.0043431881411810265, + "compression/movement_sparsity/linear_layer_sparsity": 0.2681773800066241, + "compression/movement_sparsity/model_sparsity": 0.2589646619932774, + "compression_loss": 40.80987548828125, + "distillation_loss": 0.5698367953300476, + "epoch": 2.44, + "learning_rate": 4.198365736827275e-05, + "loss": 41.6767, + "step": 2890, + "task_loss": 0.29367557168006897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3804894007297107, + "compression/movement_sparsity/importance_threshold": -0.00433888795521504, + "compression/movement_sparsity/linear_layer_sparsity": 0.2688874999618427, + "compression/movement_sparsity/model_sparsity": 0.2596503871434497, + "compression_loss": 40.87548065185547, + "distillation_loss": 0.5239613056182861, + "epoch": 2.44, + "learning_rate": 4.1978961209730446e-05, + "loss": 41.5483, + "step": 2891, + "task_loss": 0.12229099869728088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.381102980017495, + "compression/movement_sparsity/importance_threshold": -0.004334590608592605, + "compression/movement_sparsity/linear_layer_sparsity": 0.2696505155246939, + "compression/movement_sparsity/model_sparsity": 0.26038719077440653, + "compression_loss": 40.9410400390625, + "distillation_loss": 0.6010409593582153, + "epoch": 2.44, + "learning_rate": 4.1974265051188125e-05, + "loss": 41.6898, + "step": 2892, + "task_loss": 0.6402194499969482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3817161540349585, + "compression/movement_sparsity/importance_threshold": -0.004330296100376028, + "compression/movement_sparsity/linear_layer_sparsity": 0.27049797804274234, + "compression/movement_sparsity/model_sparsity": 0.26120554034786037, + "compression_loss": 41.00656509399414, + "distillation_loss": 0.649442732334137, + "epoch": 2.45, + "learning_rate": 4.196956889264582e-05, + "loss": 41.7127, + "step": 2893, + "task_loss": 0.5247517228126526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.38232892291598664, + "compression/movement_sparsity/importance_threshold": -0.004326004429627609, + "compression/movement_sparsity/linear_layer_sparsity": 0.27119722315707695, + "compression/movement_sparsity/model_sparsity": 0.261880764241388, + "compression_loss": 41.07201385498047, + "distillation_loss": 0.6194124817848206, + "epoch": 2.45, + "learning_rate": 4.1964872734103505e-05, + "loss": 41.7012, + "step": 2894, + "task_loss": 0.9719234704971313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3829412867944644, + "compression/movement_sparsity/importance_threshold": -0.004321715595409654, + "compression/movement_sparsity/linear_layer_sparsity": 0.2718854145680132, + "compression/movement_sparsity/model_sparsity": 0.26254531416023413, + "compression_loss": 41.137474060058594, + "distillation_loss": 1.0065691471099854, + "epoch": 2.45, + "learning_rate": 4.196017657556119e-05, + "loss": 41.9329, + "step": 2895, + "task_loss": 0.5961303114891052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.38355324580427663, + "compression/movement_sparsity/importance_threshold": -0.00431742959678447, + "compression/movement_sparsity/linear_layer_sparsity": 0.2725802835128254, + "compression/movement_sparsity/model_sparsity": 0.2632163122191252, + "compression_loss": 41.20283889770508, + "distillation_loss": 0.47348302602767944, + "epoch": 2.45, + "learning_rate": 4.1955480417018884e-05, + "loss": 42.0792, + "step": 2896, + "task_loss": 0.5711849927902222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3841648000793093, + "compression/movement_sparsity/importance_threshold": -0.004313146432814354, + "compression/movement_sparsity/linear_layer_sparsity": 0.2733708319787479, + "compression/movement_sparsity/model_sparsity": 0.2639797029132317, + "compression_loss": 41.26814270019531, + "distillation_loss": 0.5595443248748779, + "epoch": 2.45, + "learning_rate": 4.1950784258476564e-05, + "loss": 42.0867, + "step": 2897, + "task_loss": 0.8683832883834839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3847759497534464, + "compression/movement_sparsity/importance_threshold": -0.004308866102561619, + "compression/movement_sparsity/linear_layer_sparsity": 0.2741195623887714, + "compression/movement_sparsity/model_sparsity": 0.26470271213030666, + "compression_loss": 41.33340835571289, + "distillation_loss": 1.0132503509521484, + "epoch": 2.45, + "learning_rate": 4.194608809993426e-05, + "loss": 42.2158, + "step": 2898, + "task_loss": 0.9445706605911255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3853866949605741, + "compression/movement_sparsity/importance_threshold": -0.00430458860508856, + "compression/movement_sparsity/linear_layer_sparsity": 0.2749316578256118, + "compression/movement_sparsity/model_sparsity": 0.2654869095905939, + "compression_loss": 41.39863586425781, + "distillation_loss": 1.0065538883209229, + "epoch": 2.45, + "learning_rate": 4.194139194139194e-05, + "loss": 42.1357, + "step": 2899, + "task_loss": 0.25425082445144653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.385997035834577, + "compression/movement_sparsity/importance_threshold": -0.004300313939457486, + "compression/movement_sparsity/linear_layer_sparsity": 0.27566486296937337, + "compression/movement_sparsity/model_sparsity": 0.2661949268820644, + "compression_loss": 41.463802337646484, + "distillation_loss": 0.9346264600753784, + "epoch": 2.45, + "learning_rate": 4.1936695782849636e-05, + "loss": 42.4009, + "step": 2900, + "task_loss": 1.6883752346038818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3866069725093405, + "compression/movement_sparsity/importance_threshold": -0.0042960421047307, + "compression/movement_sparsity/linear_layer_sparsity": 0.2763852615570941, + "compression/movement_sparsity/model_sparsity": 0.2668905775620916, + "compression_loss": 41.528907775878906, + "distillation_loss": 0.5689643621444702, + "epoch": 2.45, + "learning_rate": 4.1931999624307316e-05, + "loss": 42.4614, + "step": 2901, + "task_loss": 1.0131381750106812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3872165051187495, + "compression/movement_sparsity/importance_threshold": -0.004291773099970505, + "compression/movement_sparsity/linear_layer_sparsity": 0.27716716500148053, + "compression/movement_sparsity/model_sparsity": 0.267645620217747, + "compression_loss": 41.59398651123047, + "distillation_loss": 1.53434157371521, + "epoch": 2.45, + "learning_rate": 4.1927303465765e-05, + "loss": 42.6286, + "step": 2902, + "task_loss": 1.245392084121704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.38782563379668933, + "compression/movement_sparsity/importance_threshold": -0.004287506924239207, + "compression/movement_sparsity/linear_layer_sparsity": 0.27790628453238947, + "compression/movement_sparsity/model_sparsity": 0.2683593487189716, + "compression_loss": 41.65904235839844, + "distillation_loss": 0.7506531476974487, + "epoch": 2.45, + "learning_rate": 4.1922607307222695e-05, + "loss": 42.5519, + "step": 2903, + "task_loss": 0.9118168950080872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.388434358677045, + "compression/movement_sparsity/importance_threshold": -0.004283243576599108, + "compression/movement_sparsity/linear_layer_sparsity": 0.2787032601246677, + "compression/movement_sparsity/model_sparsity": 0.26912894574787133, + "compression_loss": 41.724021911621094, + "distillation_loss": 0.6856245994567871, + "epoch": 2.45, + "learning_rate": 4.191791114868038e-05, + "loss": 42.5667, + "step": 2904, + "task_loss": 1.7196595668792725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3890426798937018, + "compression/movement_sparsity/importance_threshold": -0.004278983056112513, + "compression/movement_sparsity/linear_layer_sparsity": 0.2794917934062596, + "compression/movement_sparsity/model_sparsity": 0.2698903904854285, + "compression_loss": 41.78897476196289, + "distillation_loss": 0.47569721937179565, + "epoch": 2.46, + "learning_rate": 4.191321499013807e-05, + "loss": 42.3231, + "step": 2905, + "task_loss": 1.086851954460144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.38965059758054466, + "compression/movement_sparsity/importance_threshold": -0.004274725361841727, + "compression/movement_sparsity/linear_layer_sparsity": 0.28022291182068493, + "compression/movement_sparsity/model_sparsity": 0.270596392733135, + "compression_loss": 41.8538932800293, + "distillation_loss": 0.693184494972229, + "epoch": 2.46, + "learning_rate": 4.1908518831595754e-05, + "loss": 42.5815, + "step": 2906, + "task_loss": 0.4588662385940552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.39025811187145865, + "compression/movement_sparsity/importance_threshold": -0.004270470492849053, + "compression/movement_sparsity/linear_layer_sparsity": 0.28104732492269324, + "compression/movement_sparsity/model_sparsity": 0.271392484708898, + "compression_loss": 41.91870880126953, + "distillation_loss": 0.7055269479751587, + "epoch": 2.46, + "learning_rate": 4.190382267305345e-05, + "loss": 42.6951, + "step": 2907, + "task_loss": 0.9223252534866333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3908652229003293, + "compression/movement_sparsity/importance_threshold": -0.0042662184481967935, + "compression/movement_sparsity/linear_layer_sparsity": 0.2818362517018172, + "compression/movement_sparsity/model_sparsity": 0.2721543094261364, + "compression_loss": 41.983482360839844, + "distillation_loss": 0.6424552202224731, + "epoch": 2.46, + "learning_rate": 4.1899126514511134e-05, + "loss": 42.7729, + "step": 2908, + "task_loss": 1.3742283582687378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3914719308010416, + "compression/movement_sparsity/importance_threshold": -0.0042619692269472535, + "compression/movement_sparsity/linear_layer_sparsity": 0.28264981381127685, + "compression/movement_sparsity/model_sparsity": 0.2729399231743263, + "compression_loss": 42.0482292175293, + "distillation_loss": 0.6112089157104492, + "epoch": 2.46, + "learning_rate": 4.189443035596882e-05, + "loss": 42.9139, + "step": 2909, + "task_loss": 1.4950002431869507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.39207823570748057, + "compression/movement_sparsity/importance_threshold": -0.004257722828162738, + "compression/movement_sparsity/linear_layer_sparsity": 0.2834723548189663, + "compression/movement_sparsity/model_sparsity": 0.27373420736796955, + "compression_loss": 42.11294937133789, + "distillation_loss": 1.268035650253296, + "epoch": 2.46, + "learning_rate": 4.1889734197426507e-05, + "loss": 43.043, + "step": 2910, + "task_loss": 1.302301049232483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3926841377535315, + "compression/movement_sparsity/importance_threshold": -0.004253479250905551, + "compression/movement_sparsity/linear_layer_sparsity": 0.28428825406528263, + "compression/movement_sparsity/model_sparsity": 0.2745220779651752, + "compression_loss": 42.17762756347656, + "distillation_loss": 0.5152312517166138, + "epoch": 2.46, + "learning_rate": 4.188503803888419e-05, + "loss": 42.8772, + "step": 2911, + "task_loss": 0.4991191625595093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3932896370730794, + "compression/movement_sparsity/importance_threshold": -0.004249238494237994, + "compression/movement_sparsity/linear_layer_sparsity": 0.28499360435344684, + "compression/movement_sparsity/model_sparsity": 0.27520319730102966, + "compression_loss": 42.242218017578125, + "distillation_loss": 0.2787705063819885, + "epoch": 2.46, + "learning_rate": 4.1880341880341886e-05, + "loss": 42.943, + "step": 2912, + "task_loss": 0.1897142082452774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3938947338000094, + "compression/movement_sparsity/importance_threshold": -0.004245000557222374, + "compression/movement_sparsity/linear_layer_sparsity": 0.28569287331611676, + "compression/movement_sparsity/model_sparsity": 0.2758784442236289, + "compression_loss": 42.306766510009766, + "distillation_loss": 1.042336106300354, + "epoch": 2.46, + "learning_rate": 4.187564572179957e-05, + "loss": 43.1683, + "step": 2913, + "task_loss": 0.349286824464798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3944994280682067, + "compression/movement_sparsity/importance_threshold": -0.004240765438920993, + "compression/movement_sparsity/linear_layer_sparsity": 0.2865569938963525, + "compression/movement_sparsity/model_sparsity": 0.2767128796035877, + "compression_loss": 42.371299743652344, + "distillation_loss": 0.5994269847869873, + "epoch": 2.46, + "learning_rate": 4.187094956325726e-05, + "loss": 43.3492, + "step": 2914, + "task_loss": 1.0650286674499512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3951037200115566, + "compression/movement_sparsity/importance_threshold": -0.0042365331383961554, + "compression/movement_sparsity/linear_layer_sparsity": 0.28742491828606404, + "compression/movement_sparsity/model_sparsity": 0.2775509881204649, + "compression_loss": 42.43577194213867, + "distillation_loss": 0.5641458034515381, + "epoch": 2.46, + "learning_rate": 4.1866253404714945e-05, + "loss": 43.0292, + "step": 2915, + "task_loss": 0.42542505264282227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3957076097639439, + "compression/movement_sparsity/importance_threshold": -0.004232303654710166, + "compression/movement_sparsity/linear_layer_sparsity": 0.28816385895445845, + "compression/movement_sparsity/model_sparsity": 0.27826454390365263, + "compression_loss": 42.50020980834961, + "distillation_loss": 0.6959583759307861, + "epoch": 2.46, + "learning_rate": 4.186155724617263e-05, + "loss": 43.1865, + "step": 2916, + "task_loss": 2.0493392944335938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.39631109745925397, + "compression/movement_sparsity/importance_threshold": -0.004228076986925329, + "compression/movement_sparsity/linear_layer_sparsity": 0.28891790754324753, + "compression/movement_sparsity/model_sparsity": 0.27899268860369203, + "compression_loss": 42.56462478637695, + "distillation_loss": 0.5928775668144226, + "epoch": 2.47, + "learning_rate": 4.1856861087630324e-05, + "loss": 43.3091, + "step": 2917, + "task_loss": 0.6065967679023743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.39691418323137173, + "compression/movement_sparsity/importance_threshold": -0.004223853134103946, + "compression/movement_sparsity/linear_layer_sparsity": 0.2898188380289752, + "compression/movement_sparsity/model_sparsity": 0.2798626693556485, + "compression_loss": 42.628990173339844, + "distillation_loss": 0.8544743061065674, + "epoch": 2.47, + "learning_rate": 4.1852164929088004e-05, + "loss": 43.5233, + "step": 2918, + "task_loss": 0.3194716274738312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.39751686721418267, + "compression/movement_sparsity/importance_threshold": -0.004219632095308325, + "compression/movement_sparsity/linear_layer_sparsity": 0.29068480685519443, + "compression/movement_sparsity/model_sparsity": 0.28069888948865546, + "compression_loss": 42.69329833984375, + "distillation_loss": 0.6866007447242737, + "epoch": 2.47, + "learning_rate": 4.18474687705457e-05, + "loss": 43.7207, + "step": 2919, + "task_loss": 0.3574233949184418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.3981191495415718, + "compression/movement_sparsity/importance_threshold": -0.004215413869600764, + "compression/movement_sparsity/linear_layer_sparsity": 0.2913518924894152, + "compression/movement_sparsity/model_sparsity": 0.2813430586791453, + "compression_loss": 42.75755310058594, + "distillation_loss": 1.5404736995697021, + "epoch": 2.47, + "learning_rate": 4.1842772612003383e-05, + "loss": 43.6753, + "step": 2920, + "task_loss": 0.2516445517539978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.39872103034742423, + "compression/movement_sparsity/importance_threshold": -0.004211198456043572, + "compression/movement_sparsity/linear_layer_sparsity": 0.29219445417456524, + "compression/movement_sparsity/model_sparsity": 0.2821566757783875, + "compression_loss": 42.82181167602539, + "distillation_loss": 0.9382905960083008, + "epoch": 2.47, + "learning_rate": 4.183807645346107e-05, + "loss": 43.8622, + "step": 2921, + "task_loss": 0.8016681671142578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.399322509765625, + "compression/movement_sparsity/importance_threshold": -0.004206985853699052, + "compression/movement_sparsity/linear_layer_sparsity": 0.2929272896691301, + "compression/movement_sparsity/model_sparsity": 0.2828643361192484, + "compression_loss": 42.88600158691406, + "distillation_loss": 1.0830966234207153, + "epoch": 2.47, + "learning_rate": 4.1833380294918756e-05, + "loss": 43.8048, + "step": 2922, + "task_loss": 1.0063364505767822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.39992358793005933, + "compression/movement_sparsity/importance_threshold": -0.004202776061629507, + "compression/movement_sparsity/linear_layer_sparsity": 0.2936662064891893, + "compression/movement_sparsity/model_sparsity": 0.2835778688733645, + "compression_loss": 42.95016098022461, + "distillation_loss": 0.9263947010040283, + "epoch": 2.47, + "learning_rate": 4.182868413637644e-05, + "loss": 43.8085, + "step": 2923, + "task_loss": 1.1111057996749878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40052426497461247, + "compression/movement_sparsity/importance_threshold": -0.004198569078897242, + "compression/movement_sparsity/linear_layer_sparsity": 0.2945333915805074, + "compression/movement_sparsity/model_sparsity": 0.28441526348902246, + "compression_loss": 43.0142707824707, + "distillation_loss": 0.8823603391647339, + "epoch": 2.47, + "learning_rate": 4.1823987977834136e-05, + "loss": 43.784, + "step": 2924, + "task_loss": 1.5097572803497314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4011245410331694, + "compression/movement_sparsity/importance_threshold": -0.0041943649045645594, + "compression/movement_sparsity/linear_layer_sparsity": 0.2954618549693062, + "compression/movement_sparsity/model_sparsity": 0.2853118313041286, + "compression_loss": 43.07833480834961, + "distillation_loss": 0.7987208366394043, + "epoch": 2.47, + "learning_rate": 4.181929181929182e-05, + "loss": 43.8147, + "step": 2925, + "task_loss": 0.3470713496208191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4017244162396153, + "compression/movement_sparsity/importance_threshold": -0.004190163537693765, + "compression/movement_sparsity/linear_layer_sparsity": 0.2963320568750363, + "compression/movement_sparsity/model_sparsity": 0.2861521390973426, + "compression_loss": 43.14236068725586, + "distillation_loss": 0.9048537015914917, + "epoch": 2.47, + "learning_rate": 4.181459566074951e-05, + "loss": 43.8766, + "step": 2926, + "task_loss": 1.1825629472732544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40232389072783514, + "compression/movement_sparsity/importance_threshold": -0.004185964977347164, + "compression/movement_sparsity/linear_layer_sparsity": 0.2972232810883083, + "compression/movement_sparsity/model_sparsity": 0.2870127470171624, + "compression_loss": 43.206329345703125, + "distillation_loss": 1.0933799743652344, + "epoch": 2.47, + "learning_rate": 4.1809899502207195e-05, + "loss": 43.9441, + "step": 2927, + "task_loss": 1.2188332080841064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4029229646317144, + "compression/movement_sparsity/importance_threshold": -0.0041817692225870565, + "compression/movement_sparsity/linear_layer_sparsity": 0.29815454665650154, + "compression/movement_sparsity/model_sparsity": 0.28791202074818023, + "compression_loss": 43.27021408081055, + "distillation_loss": 0.7390118837356567, + "epoch": 2.47, + "learning_rate": 4.180520334366488e-05, + "loss": 43.9092, + "step": 2928, + "task_loss": 0.6102986335754395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4035216380851381, + "compression/movement_sparsity/importance_threshold": -0.004177576272475747, + "compression/movement_sparsity/linear_layer_sparsity": 0.2990434575812522, + "compression/movement_sparsity/model_sparsity": 0.28877039484805583, + "compression_loss": 43.33408737182617, + "distillation_loss": 0.5542397499084473, + "epoch": 2.48, + "learning_rate": 4.1800507185122574e-05, + "loss": 44.2212, + "step": 2929, + "task_loss": 0.7207878232002258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4041199112219914, + "compression/movement_sparsity/importance_threshold": -0.004173386126075542, + "compression/movement_sparsity/linear_layer_sparsity": 0.2998072124424969, + "compression/movement_sparsity/model_sparsity": 0.289507912380232, + "compression_loss": 43.39794158935547, + "distillation_loss": 0.3249339163303375, + "epoch": 2.48, + "learning_rate": 4.179581102658026e-05, + "loss": 44.0025, + "step": 2930, + "task_loss": 0.80772864818573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40471778417615933, + "compression/movement_sparsity/importance_threshold": -0.0041691987824487445, + "compression/movement_sparsity/linear_layer_sparsity": 0.3005536891848372, + "compression/movement_sparsity/model_sparsity": 0.29022874535004184, + "compression_loss": 43.46168518066406, + "distillation_loss": 0.5256613492965698, + "epoch": 2.48, + "learning_rate": 4.179111486803795e-05, + "loss": 44.3212, + "step": 2931, + "task_loss": 0.5185351371765137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40531525708152716, + "compression/movement_sparsity/importance_threshold": -0.004165014240657658, + "compression/movement_sparsity/linear_layer_sparsity": 0.3014118953779255, + "compression/movement_sparsity/model_sparsity": 0.2910574695202465, + "compression_loss": 43.52542495727539, + "distillation_loss": 0.839435338973999, + "epoch": 2.48, + "learning_rate": 4.178641870949563e-05, + "loss": 44.4701, + "step": 2932, + "task_loss": 1.6290571689605713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4059123300719799, + "compression/movement_sparsity/importance_threshold": -0.004160832499764586, + "compression/movement_sparsity/linear_layer_sparsity": 0.3021416782855756, + "compression/movement_sparsity/model_sparsity": 0.291762182139944, + "compression_loss": 43.58913040161133, + "distillation_loss": 0.5258724689483643, + "epoch": 2.48, + "learning_rate": 4.178172255095332e-05, + "loss": 44.2958, + "step": 2933, + "task_loss": 0.663270890712738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4065090032814027, + "compression/movement_sparsity/importance_threshold": -0.004156653558831835, + "compression/movement_sparsity/linear_layer_sparsity": 0.30288673605196725, + "compression/movement_sparsity/model_sparsity": 0.2924816448799943, + "compression_loss": 43.652732849121094, + "distillation_loss": 0.6036491394042969, + "epoch": 2.48, + "learning_rate": 4.177702639241101e-05, + "loss": 44.465, + "step": 2934, + "task_loss": 0.5574185848236084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4071052768436808, + "compression/movement_sparsity/importance_threshold": -0.004152477416921705, + "compression/movement_sparsity/linear_layer_sparsity": 0.30364587626033684, + "compression/movement_sparsity/model_sparsity": 0.29321470628681795, + "compression_loss": 43.71631622314453, + "distillation_loss": 0.33077341318130493, + "epoch": 2.48, + "learning_rate": 4.177233023386869e-05, + "loss": 44.3507, + "step": 2935, + "task_loss": 0.03658715635538101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4077011508926993, + "compression/movement_sparsity/importance_threshold": -0.004148304073096505, + "compression/movement_sparsity/linear_layer_sparsity": 0.30454377800748494, + "compression/movement_sparsity/model_sparsity": 0.29408176234668265, + "compression_loss": 43.7798957824707, + "distillation_loss": 0.5207771062850952, + "epoch": 2.48, + "learning_rate": 4.1767634075326385e-05, + "loss": 44.4831, + "step": 2936, + "task_loss": 0.5050748586654663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40829662556234336, + "compression/movement_sparsity/importance_threshold": -0.004144133526418533, + "compression/movement_sparsity/linear_layer_sparsity": 0.30534378233834264, + "compression/movement_sparsity/model_sparsity": 0.2948542840676742, + "compression_loss": 43.84335708618164, + "distillation_loss": 0.8236490488052368, + "epoch": 2.48, + "learning_rate": 4.176293791678407e-05, + "loss": 44.8155, + "step": 2937, + "task_loss": 0.5536699891090393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40889170098649785, + "compression/movement_sparsity/importance_threshold": -0.0041399657759501, + "compression/movement_sparsity/linear_layer_sparsity": 0.30613796767539403, + "compression/movement_sparsity/model_sparsity": 0.295621186695198, + "compression_loss": 43.90678405761719, + "distillation_loss": 0.6964631080627441, + "epoch": 2.48, + "learning_rate": 4.1758241758241765e-05, + "loss": 44.7836, + "step": 2938, + "task_loss": 0.21174968779087067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.40948637729904847, + "compression/movement_sparsity/importance_threshold": -0.004135800820753504, + "compression/movement_sparsity/linear_layer_sparsity": 0.30702517344417274, + "compression/movement_sparsity/model_sparsity": 0.29647791421645503, + "compression_loss": 43.97017288208008, + "distillation_loss": 1.527217149734497, + "epoch": 2.48, + "learning_rate": 4.1753545599699444e-05, + "loss": 45.2043, + "step": 2939, + "task_loss": 1.802750825881958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4100806546338799, + "compression/movement_sparsity/importance_threshold": -0.0041316386598910515, + "compression/movement_sparsity/linear_layer_sparsity": 0.3079654298347635, + "compression/movement_sparsity/model_sparsity": 0.29738586990746196, + "compression_loss": 44.033512115478516, + "distillation_loss": 0.7779862880706787, + "epoch": 2.48, + "learning_rate": 4.174884944115714e-05, + "loss": 44.942, + "step": 2940, + "task_loss": 0.6638805270195007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.41067453312487745, + "compression/movement_sparsity/importance_threshold": -0.004127479292425045, + "compression/movement_sparsity/linear_layer_sparsity": 0.3086950577282343, + "compression/movement_sparsity/model_sparsity": 0.2980904328381941, + "compression_loss": 44.09684371948242, + "distillation_loss": 0.5371556282043457, + "epoch": 2.49, + "learning_rate": 4.1744153282614824e-05, + "loss": 44.7625, + "step": 2941, + "task_loss": 1.0014811754226685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4112680129059263, + "compression/movement_sparsity/importance_threshold": -0.0041233227174177906, + "compression/movement_sparsity/linear_layer_sparsity": 0.30942293276906263, + "compression/movement_sparsity/model_sparsity": 0.29879330313216446, + "compression_loss": 44.16010284423828, + "distillation_loss": 0.9129904508590698, + "epoch": 2.49, + "learning_rate": 4.173945712407251e-05, + "loss": 44.8287, + "step": 2942, + "task_loss": 0.8579537272453308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.41186109411091154, + "compression/movement_sparsity/importance_threshold": -0.0041191689339315895, + "compression/movement_sparsity/linear_layer_sparsity": 0.3103906266693835, + "compression/movement_sparsity/model_sparsity": 0.29972775377003463, + "compression_loss": 44.22334289550781, + "distillation_loss": 1.5290093421936035, + "epoch": 2.49, + "learning_rate": 4.17347609655302e-05, + "loss": 45.246, + "step": 2943, + "task_loss": 1.0454132556915283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4124537768737182, + "compression/movement_sparsity/importance_threshold": -0.004115017941028749, + "compression/movement_sparsity/linear_layer_sparsity": 0.3111751772789851, + "compression/movement_sparsity/model_sparsity": 0.30048535265263643, + "compression_loss": 44.2865104675293, + "distillation_loss": 0.5292792320251465, + "epoch": 2.49, + "learning_rate": 4.173006480698788e-05, + "loss": 44.9714, + "step": 2944, + "task_loss": 0.6021685004234314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.41304606132823163, + "compression/movement_sparsity/importance_threshold": -0.004110869737771571, + "compression/movement_sparsity/linear_layer_sparsity": 0.3120837630803349, + "compression/movement_sparsity/model_sparsity": 0.3013627257365731, + "compression_loss": 44.34966278076172, + "distillation_loss": 1.10421621799469, + "epoch": 2.49, + "learning_rate": 4.1725368648445576e-05, + "loss": 45.1226, + "step": 2945, + "task_loss": 0.9707024097442627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4136379476083367, + "compression/movement_sparsity/importance_threshold": -0.004106724323222361, + "compression/movement_sparsity/linear_layer_sparsity": 0.31276450188649874, + "compression/movement_sparsity/model_sparsity": 0.30202007907054756, + "compression_loss": 44.412776947021484, + "distillation_loss": 0.32868367433547974, + "epoch": 2.49, + "learning_rate": 4.172067248990326e-05, + "loss": 45.065, + "step": 2946, + "task_loss": 0.6558057069778442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4142294358479186, + "compression/movement_sparsity/importance_threshold": -0.004102581696443423, + "compression/movement_sparsity/linear_layer_sparsity": 0.31346224455571126, + "compression/movement_sparsity/model_sparsity": 0.30269385213256517, + "compression_loss": 44.475826263427734, + "distillation_loss": 0.8931636214256287, + "epoch": 2.49, + "learning_rate": 4.171597633136095e-05, + "loss": 45.4207, + "step": 2947, + "task_loss": 1.1884026527404785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.41482052618086307, + "compression/movement_sparsity/importance_threshold": -0.004098441856497056, + "compression/movement_sparsity/linear_layer_sparsity": 0.3142285273404947, + "compression/movement_sparsity/model_sparsity": 0.30343381074632975, + "compression_loss": 44.53882598876953, + "distillation_loss": 0.6348766088485718, + "epoch": 2.49, + "learning_rate": 4.1711280172818635e-05, + "loss": 45.4906, + "step": 2948, + "task_loss": 0.3994178771972656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.41541121874105424, + "compression/movement_sparsity/importance_threshold": -0.004094304802445572, + "compression/movement_sparsity/linear_layer_sparsity": 0.3150193500622728, + "compression/movement_sparsity/model_sparsity": 0.30419746627475946, + "compression_loss": 44.60183334350586, + "distillation_loss": 0.6538788080215454, + "epoch": 2.49, + "learning_rate": 4.170658401427632e-05, + "loss": 45.5572, + "step": 2949, + "task_loss": 0.8999935984611511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.41600151366237803, + "compression/movement_sparsity/importance_threshold": -0.004090170533351268, + "compression/movement_sparsity/linear_layer_sparsity": 0.31585925265804005, + "compression/movement_sparsity/model_sparsity": 0.3050085156325195, + "compression_loss": 44.664798736572266, + "distillation_loss": 1.3343597650527954, + "epoch": 2.49, + "learning_rate": 4.1701887855734014e-05, + "loss": 45.5255, + "step": 2950, + "task_loss": 1.1742417812347412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.41659141107871933, + "compression/movement_sparsity/importance_threshold": -0.004086039048276452, + "compression/movement_sparsity/linear_layer_sparsity": 0.31663964173313675, + "compression/movement_sparsity/model_sparsity": 0.30576209594212905, + "compression_loss": 44.727718353271484, + "distillation_loss": 1.0035552978515625, + "epoch": 2.49, + "learning_rate": 4.16971916971917e-05, + "loss": 45.6523, + "step": 2951, + "task_loss": 1.1468698978424072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.41718091112396327, + "compression/movement_sparsity/importance_threshold": -0.004081910346283427, + "compression/movement_sparsity/linear_layer_sparsity": 0.31743984877484427, + "compression/movement_sparsity/model_sparsity": 0.3065348134102291, + "compression_loss": 44.790584564208984, + "distillation_loss": 1.2424607276916504, + "epoch": 2.5, + "learning_rate": 4.169249553864939e-05, + "loss": 45.6839, + "step": 2952, + "task_loss": 0.9031699895858765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.417770013931995, + "compression/movement_sparsity/importance_threshold": -0.004077784426434496, + "compression/movement_sparsity/linear_layer_sparsity": 0.318247448800486, + "compression/movement_sparsity/model_sparsity": 0.30731466989052175, + "compression_loss": 44.85344696044922, + "distillation_loss": 0.9061406850814819, + "epoch": 2.5, + "learning_rate": 4.168779938010707e-05, + "loss": 45.7166, + "step": 2953, + "task_loss": 0.27855953574180603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.41835871963669957, + "compression/movement_sparsity/importance_threshold": -0.004073661287791965, + "compression/movement_sparsity/linear_layer_sparsity": 0.3190533555943234, + "compression/movement_sparsity/model_sparsity": 0.30809289130673156, + "compression_loss": 44.916255950927734, + "distillation_loss": 0.6621519327163696, + "epoch": 2.5, + "learning_rate": 4.168310322156476e-05, + "loss": 45.7283, + "step": 2954, + "task_loss": 0.290755033493042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4189470283719622, + "compression/movement_sparsity/importance_threshold": -0.004069540929418136, + "compression/movement_sparsity/linear_layer_sparsity": 0.3198381089147748, + "compression/movement_sparsity/model_sparsity": 0.3088506859364419, + "compression_loss": 44.97902297973633, + "distillation_loss": 0.8518170714378357, + "epoch": 2.5, + "learning_rate": 4.167840706302245e-05, + "loss": 45.7434, + "step": 2955, + "task_loss": 2.3175394535064697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.41953494027166816, + "compression/movement_sparsity/importance_threshold": -0.004065423350375314, + "compression/movement_sparsity/linear_layer_sparsity": 0.3206730272084703, + "compression/movement_sparsity/model_sparsity": 0.30965692221823987, + "compression_loss": 45.04173278808594, + "distillation_loss": 1.1798666715621948, + "epoch": 2.5, + "learning_rate": 4.167371090448014e-05, + "loss": 46.1728, + "step": 2956, + "task_loss": 0.5762167572975159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4201224554697024, + "compression/movement_sparsity/importance_threshold": -0.004061308549725803, + "compression/movement_sparsity/linear_layer_sparsity": 0.3216044954875134, + "compression/movement_sparsity/model_sparsity": 0.31055639169636623, + "compression_loss": 45.10441970825195, + "distillation_loss": 1.5197710990905762, + "epoch": 2.5, + "learning_rate": 4.1669014745937825e-05, + "loss": 46.2245, + "step": 2957, + "task_loss": 1.622458577156067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4207095740999499, + "compression/movement_sparsity/importance_threshold": -0.004057196526531907, + "compression/movement_sparsity/linear_layer_sparsity": 0.32241366950328304, + "compression/movement_sparsity/model_sparsity": 0.3113377680953837, + "compression_loss": 45.16708755493164, + "distillation_loss": 0.8017421364784241, + "epoch": 2.5, + "learning_rate": 4.166431858739551e-05, + "loss": 45.9341, + "step": 2958, + "task_loss": 0.7219691276550293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42129629629629617, + "compression/movement_sparsity/importance_threshold": -0.0040530872798559305, + "compression/movement_sparsity/linear_layer_sparsity": 0.32323511348755, + "compression/movement_sparsity/model_sparsity": 0.31213099295173385, + "compression_loss": 45.229671478271484, + "distillation_loss": 0.8418365120887756, + "epoch": 2.5, + "learning_rate": 4.16596224288532e-05, + "loss": 46.0451, + "step": 2959, + "task_loss": 0.9927123188972473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4218826221926263, + "compression/movement_sparsity/importance_threshold": -0.004048980808760175, + "compression/movement_sparsity/linear_layer_sparsity": 0.3240397443954504, + "compression/movement_sparsity/model_sparsity": 0.3129079823126137, + "compression_loss": 45.29218292236328, + "distillation_loss": 0.5655040144920349, + "epoch": 2.5, + "learning_rate": 4.165492627031089e-05, + "loss": 46.3866, + "step": 2960, + "task_loss": 0.5054582953453064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4224685519228254, + "compression/movement_sparsity/importance_threshold": -0.0040448771123069455, + "compression/movement_sparsity/linear_layer_sparsity": 0.3248579330819528, + "compression/movement_sparsity/model_sparsity": 0.3136980637006919, + "compression_loss": 45.354671478271484, + "distillation_loss": 1.1326653957366943, + "epoch": 2.5, + "learning_rate": 4.165023011176857e-05, + "loss": 46.2157, + "step": 2961, + "task_loss": 1.6219327449798584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4230540856207784, + "compression/movement_sparsity/importance_threshold": -0.004040776189558549, + "compression/movement_sparsity/linear_layer_sparsity": 0.32563642621439537, + "compression/movement_sparsity/model_sparsity": 0.3144498131991101, + "compression_loss": 45.4171257019043, + "distillation_loss": 0.9375326633453369, + "epoch": 2.5, + "learning_rate": 4.1645533953226264e-05, + "loss": 46.2661, + "step": 2962, + "task_loss": 1.1186583042144775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42363922342037075, + "compression/movement_sparsity/importance_threshold": -0.004036678039577284, + "compression/movement_sparsity/linear_layer_sparsity": 0.32643619206190033, + "compression/movement_sparsity/model_sparsity": 0.3152221046293857, + "compression_loss": 45.479515075683594, + "distillation_loss": 0.5558945536613464, + "epoch": 2.5, + "learning_rate": 4.164083779468395e-05, + "loss": 46.2633, + "step": 2963, + "task_loss": 0.5486981272697449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42422396545548735, + "compression/movement_sparsity/importance_threshold": -0.004032582661425459, + "compression/movement_sparsity/linear_layer_sparsity": 0.32721348085341173, + "compression/movement_sparsity/model_sparsity": 0.3159726911596887, + "compression_loss": 45.5418586730957, + "distillation_loss": 1.4929382801055908, + "epoch": 2.51, + "learning_rate": 4.1636141636141643e-05, + "loss": 46.6576, + "step": 2964, + "task_loss": 0.8191894888877869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42480831186001333, + "compression/movement_sparsity/importance_threshold": -0.004028490054165378, + "compression/movement_sparsity/linear_layer_sparsity": 0.32807930658961937, + "compression/movement_sparsity/model_sparsity": 0.31680877311826605, + "compression_loss": 45.604190826416016, + "distillation_loss": 0.7607845664024353, + "epoch": 2.51, + "learning_rate": 4.163144547759932e-05, + "loss": 46.6781, + "step": 2965, + "task_loss": 0.5233112573623657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42539226276783404, + "compression/movement_sparsity/importance_threshold": -0.004024400216859342, + "compression/movement_sparsity/linear_layer_sparsity": 0.32893274311565335, + "compression/movement_sparsity/model_sparsity": 0.3176328914741529, + "compression_loss": 45.6664924621582, + "distillation_loss": 0.8617205023765564, + "epoch": 2.51, + "learning_rate": 4.162674931905701e-05, + "loss": 46.4778, + "step": 2966, + "task_loss": 0.9161517024040222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4259758183128345, + "compression/movement_sparsity/importance_threshold": -0.004020313148569658, + "compression/movement_sparsity/linear_layer_sparsity": 0.32969610447936604, + "compression/movement_sparsity/model_sparsity": 0.31837002902664785, + "compression_loss": 45.72873306274414, + "distillation_loss": 0.5909499526023865, + "epoch": 2.51, + "learning_rate": 4.16220531605147e-05, + "loss": 46.8295, + "step": 2967, + "task_loss": 0.954791784286499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4265589786288996, + "compression/movement_sparsity/importance_threshold": -0.004016228848358628, + "compression/movement_sparsity/linear_layer_sparsity": 0.3304533964417521, + "compression/movement_sparsity/model_sparsity": 0.31910130568042333, + "compression_loss": 45.79096221923828, + "distillation_loss": 0.7730787992477417, + "epoch": 2.51, + "learning_rate": 4.161735700197239e-05, + "loss": 46.854, + "step": 2968, + "task_loss": 1.202275037765503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.427141743849915, + "compression/movement_sparsity/importance_threshold": -0.004012147315288556, + "compression/movement_sparsity/linear_layer_sparsity": 0.3312154222986895, + "compression/movement_sparsity/model_sparsity": 0.3198371536049093, + "compression_loss": 45.85311508178711, + "distillation_loss": 0.4474985897541046, + "epoch": 2.51, + "learning_rate": 4.1612660843430075e-05, + "loss": 46.703, + "step": 2969, + "task_loss": 0.27049994468688965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4277241141097655, + "compression/movement_sparsity/importance_threshold": -0.004008068548421747, + "compression/movement_sparsity/linear_layer_sparsity": 0.33205509833527175, + "compression/movement_sparsity/model_sparsity": 0.3206479841864892, + "compression_loss": 45.915199279785156, + "distillation_loss": 1.236438274383545, + "epoch": 2.51, + "learning_rate": 4.160796468488776e-05, + "loss": 46.9184, + "step": 2970, + "task_loss": 1.1832753419876099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4283060895423363, + "compression/movement_sparsity/importance_threshold": -0.004003992546820505, + "compression/movement_sparsity/linear_layer_sparsity": 0.3327313178819015, + "compression/movement_sparsity/model_sparsity": 0.32130097351139764, + "compression_loss": 45.97730255126953, + "distillation_loss": 0.5596473217010498, + "epoch": 2.51, + "learning_rate": 4.1603268526345455e-05, + "loss": 46.8051, + "step": 2971, + "task_loss": 0.4584386944770813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.42888767028151265, + "compression/movement_sparsity/importance_threshold": -0.003999919309547132, + "compression/movement_sparsity/linear_layer_sparsity": 0.3336013528492846, + "compression/movement_sparsity/model_sparsity": 0.3221411201011105, + "compression_loss": 46.03932571411133, + "distillation_loss": 1.1359453201293945, + "epoch": 2.51, + "learning_rate": 4.159857236780314e-05, + "loss": 47.2822, + "step": 2972, + "task_loss": 0.8737763166427612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4294688564611795, + "compression/movement_sparsity/importance_threshold": -0.003995848835663934, + "compression/movement_sparsity/linear_layer_sparsity": 0.33431665981742476, + "compression/movement_sparsity/model_sparsity": 0.32283185407435344, + "compression_loss": 46.101314544677734, + "distillation_loss": 0.8587710857391357, + "epoch": 2.51, + "learning_rate": 4.159387620926083e-05, + "loss": 47.157, + "step": 2973, + "task_loss": 0.6843359470367432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4300496482152222, + "compression/movement_sparsity/importance_threshold": -0.003991781124233214, + "compression/movement_sparsity/linear_layer_sparsity": 0.3350952841157114, + "compression/movement_sparsity/model_sparsity": 0.32358373023266535, + "compression_loss": 46.163265228271484, + "distillation_loss": 1.0358749628067017, + "epoch": 2.51, + "learning_rate": 4.1589180050718514e-05, + "loss": 47.0823, + "step": 2974, + "task_loss": 0.6716317534446716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4306300456775256, + "compression/movement_sparsity/importance_threshold": -0.003987716174317276, + "compression/movement_sparsity/linear_layer_sparsity": 0.3358344632674585, + "compression/movement_sparsity/model_sparsity": 0.32429751630656894, + "compression_loss": 46.22522735595703, + "distillation_loss": 0.8303221464157104, + "epoch": 2.51, + "learning_rate": 4.15844838921762e-05, + "loss": 47.1971, + "step": 2975, + "task_loss": 1.596840262413025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4312100489819751, + "compression/movement_sparsity/importance_threshold": -0.003983653984978425, + "compression/movement_sparsity/linear_layer_sparsity": 0.33653064387071074, + "compression/movement_sparsity/model_sparsity": 0.32496978096439744, + "compression_loss": 46.28711700439453, + "distillation_loss": 1.0439051389694214, + "epoch": 2.52, + "learning_rate": 4.157978773363389e-05, + "loss": 47.2883, + "step": 2976, + "task_loss": 0.7901548743247986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.43178965826245586, + "compression/movement_sparsity/importance_threshold": -0.003979594555278965, + "compression/movement_sparsity/linear_layer_sparsity": 0.3372892832640396, + "compression/movement_sparsity/model_sparsity": 0.3257023587607177, + "compression_loss": 46.348941802978516, + "distillation_loss": 1.0318669080734253, + "epoch": 2.52, + "learning_rate": 4.157509157509158e-05, + "loss": 47.4219, + "step": 2977, + "task_loss": 0.3478185832500458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4323688736528526, + "compression/movement_sparsity/importance_threshold": -0.0039755378842812, + "compression/movement_sparsity/linear_layer_sparsity": 0.3379916286619596, + "compression/movement_sparsity/model_sparsity": 0.32638057643355195, + "compression_loss": 46.41075897216797, + "distillation_loss": 2.2380435466766357, + "epoch": 2.52, + "learning_rate": 4.1570395416549266e-05, + "loss": 47.5066, + "step": 2978, + "task_loss": 1.533159852027893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.432947695287051, + "compression/movement_sparsity/importance_threshold": -0.003971483971047432, + "compression/movement_sparsity/linear_layer_sparsity": 0.33872290209056416, + "compression/movement_sparsity/model_sparsity": 0.32708672837022373, + "compression_loss": 46.472537994384766, + "distillation_loss": 1.2040711641311646, + "epoch": 2.52, + "learning_rate": 4.156569925800695e-05, + "loss": 47.4991, + "step": 2979, + "task_loss": 0.7526586055755615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.43352612329893603, + "compression/movement_sparsity/importance_threshold": -0.0039674328146399665, + "compression/movement_sparsity/linear_layer_sparsity": 0.3395400772228175, + "compression/movement_sparsity/model_sparsity": 0.3278758310227594, + "compression_loss": 46.53425598144531, + "distillation_loss": 1.2190628051757812, + "epoch": 2.52, + "learning_rate": 4.156100309946464e-05, + "loss": 47.353, + "step": 2980, + "task_loss": 0.8773961663246155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.43410415782239276, + "compression/movement_sparsity/importance_threshold": -0.003963384414121107, + "compression/movement_sparsity/linear_layer_sparsity": 0.3403221118330479, + "compression/movement_sparsity/model_sparsity": 0.32863100033830855, + "compression_loss": 46.59595489501953, + "distillation_loss": 0.99399733543396, + "epoch": 2.52, + "learning_rate": 4.155630694092233e-05, + "loss": 47.7408, + "step": 2981, + "task_loss": 0.7053678631782532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4346817989913063, + "compression/movement_sparsity/importance_threshold": -0.0039593387685531575, + "compression/movement_sparsity/linear_layer_sparsity": 0.34111178983473295, + "compression/movement_sparsity/model_sparsity": 0.32939355047130203, + "compression_loss": 46.6575927734375, + "distillation_loss": 0.9112321138381958, + "epoch": 2.52, + "learning_rate": 4.155161078238001e-05, + "loss": 47.7786, + "step": 2982, + "task_loss": 0.38202929496765137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.43525904693956197, + "compression/movement_sparsity/importance_threshold": -0.003955295876998422, + "compression/movement_sparsity/linear_layer_sparsity": 0.3418831403907617, + "compression/movement_sparsity/model_sparsity": 0.3301384027627793, + "compression_loss": 46.71919631958008, + "distillation_loss": 0.5843592882156372, + "epoch": 2.52, + "learning_rate": 4.1546914623837704e-05, + "loss": 47.5869, + "step": 2983, + "task_loss": 1.0483797788619995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4358359018010447, + "compression/movement_sparsity/importance_threshold": -0.003951255738519205, + "compression/movement_sparsity/linear_layer_sparsity": 0.34270929442124476, + "compression/movement_sparsity/model_sparsity": 0.33093617586076823, + "compression_loss": 46.78078079223633, + "distillation_loss": 1.1728811264038086, + "epoch": 2.52, + "learning_rate": 4.154221846529539e-05, + "loss": 48.0374, + "step": 2984, + "task_loss": 0.6150769591331482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.43641236370963965, + "compression/movement_sparsity/importance_threshold": -0.00394721835217781, + "compression/movement_sparsity/linear_layer_sparsity": 0.3434724530741077, + "compression/movement_sparsity/model_sparsity": 0.33167311766615465, + "compression_loss": 46.84231185913086, + "distillation_loss": 0.9625322818756104, + "epoch": 2.52, + "learning_rate": 4.153752230675308e-05, + "loss": 47.7726, + "step": 2985, + "task_loss": 0.5456115007400513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.43698843279923205, + "compression/movement_sparsity/importance_threshold": -0.003943183717036542, + "compression/movement_sparsity/linear_layer_sparsity": 0.3442326068367263, + "compression/movement_sparsity/model_sparsity": 0.33240715780852087, + "compression_loss": 46.90380096435547, + "distillation_loss": 0.8806748390197754, + "epoch": 2.52, + "learning_rate": 4.153282614821077e-05, + "loss": 47.7267, + "step": 2986, + "task_loss": 1.218402624130249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.43756410920370714, + "compression/movement_sparsity/importance_threshold": -0.003939151832157703, + "compression/movement_sparsity/linear_layer_sparsity": 0.34499141316840204, + "compression/movement_sparsity/model_sparsity": 0.3331398968083422, + "compression_loss": 46.965213775634766, + "distillation_loss": 0.9695619344711304, + "epoch": 2.52, + "learning_rate": 4.152812998966845e-05, + "loss": 48.1035, + "step": 2987, + "task_loss": 0.7791354656219482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4381393930569496, + "compression/movement_sparsity/importance_threshold": -0.0039351226966036, + "compression/movement_sparsity/linear_layer_sparsity": 0.3458051064437057, + "compression/movement_sparsity/model_sparsity": 0.33392563721642593, + "compression_loss": 47.02659225463867, + "distillation_loss": 1.5583608150482178, + "epoch": 2.53, + "learning_rate": 4.152343383112614e-05, + "loss": 48.2211, + "step": 2988, + "task_loss": 1.8578460216522217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4387142844928451, + "compression/movement_sparsity/importance_threshold": -0.003931096309436534, + "compression/movement_sparsity/linear_layer_sparsity": 0.3465370237773626, + "compression/movement_sparsity/model_sparsity": 0.33463241093803064, + "compression_loss": 47.08794021606445, + "distillation_loss": 1.1930773258209229, + "epoch": 2.53, + "learning_rate": 4.151873767258383e-05, + "loss": 48.296, + "step": 2989, + "task_loss": 0.5154758095741272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4392887836452787, + "compression/movement_sparsity/importance_threshold": -0.00392707266971881, + "compression/movement_sparsity/linear_layer_sparsity": 0.34717675537102666, + "compression/movement_sparsity/model_sparsity": 0.3352501657834077, + "compression_loss": 47.14923095703125, + "distillation_loss": 0.7614381909370422, + "epoch": 2.53, + "learning_rate": 4.1514041514041515e-05, + "loss": 48.1385, + "step": 2990, + "task_loss": 0.5609140992164612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.43986289064813533, + "compression/movement_sparsity/importance_threshold": -0.003923051776512731, + "compression/movement_sparsity/linear_layer_sparsity": 0.34801950784285896, + "compression/movement_sparsity/model_sparsity": 0.3360639671152227, + "compression_loss": 47.21049880981445, + "distillation_loss": 0.7212532758712769, + "epoch": 2.53, + "learning_rate": 4.15093453554992e-05, + "loss": 47.9919, + "step": 2991, + "task_loss": 0.6041557788848877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4404366056353003, + "compression/movement_sparsity/importance_threshold": -0.003919033628880603, + "compression/movement_sparsity/linear_layer_sparsity": 0.3488089354370236, + "compression/movement_sparsity/model_sparsity": 0.3368262754429645, + "compression_loss": 47.27173614501953, + "distillation_loss": 0.9841560125350952, + "epoch": 2.53, + "learning_rate": 4.150464919695689e-05, + "loss": 48.3682, + "step": 2992, + "task_loss": 1.401531457901001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4410099287406586, + "compression/movement_sparsity/importance_threshold": -0.003915018225884729, + "compression/movement_sparsity/linear_layer_sparsity": 0.34959409417917464, + "compression/movement_sparsity/model_sparsity": 0.3375844615668918, + "compression_loss": 47.33286666870117, + "distillation_loss": 1.1893310546875, + "epoch": 2.53, + "learning_rate": 4.149995303841458e-05, + "loss": 48.6276, + "step": 2993, + "task_loss": 0.9419896006584167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.44158286009809544, + "compression/movement_sparsity/importance_threshold": -0.003911005566587413, + "compression/movement_sparsity/linear_layer_sparsity": 0.3502905132657796, + "compression/movement_sparsity/model_sparsity": 0.3382569565154362, + "compression_loss": 47.39405059814453, + "distillation_loss": 1.1872023344039917, + "epoch": 2.53, + "learning_rate": 4.149525687987227e-05, + "loss": 48.2131, + "step": 2994, + "task_loss": 1.1679962873458862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.442155399841496, + "compression/movement_sparsity/importance_threshold": -0.003906995650050959, + "compression/movement_sparsity/linear_layer_sparsity": 0.3509968413356899, + "compression/movement_sparsity/model_sparsity": 0.3389390200432258, + "compression_loss": 47.455116271972656, + "distillation_loss": 1.6993533372879028, + "epoch": 2.53, + "learning_rate": 4.1490560721329954e-05, + "loss": 48.4907, + "step": 2995, + "task_loss": 1.8787879943847656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.44272754810474546, + "compression/movement_sparsity/importance_threshold": -0.00390298847533767, + "compression/movement_sparsity/linear_layer_sparsity": 0.3516989959469277, + "compression/movement_sparsity/model_sparsity": 0.3396170534834873, + "compression_loss": 47.516204833984375, + "distillation_loss": 0.6081333160400391, + "epoch": 2.53, + "learning_rate": 4.148586456278764e-05, + "loss": 48.4946, + "step": 2996, + "task_loss": 0.8442394733428955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4432993050217289, + "compression/movement_sparsity/importance_threshold": -0.0038989840415098514, + "compression/movement_sparsity/linear_layer_sparsity": 0.3523986822554649, + "compression/movement_sparsity/model_sparsity": 0.3402927034148394, + "compression_loss": 47.577247619628906, + "distillation_loss": 1.686255693435669, + "epoch": 2.53, + "learning_rate": 4.1481168404245327e-05, + "loss": 48.5204, + "step": 2997, + "task_loss": 0.6181730628013611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.44387067072633113, + "compression/movement_sparsity/importance_threshold": -0.003894982347629808, + "compression/movement_sparsity/linear_layer_sparsity": 0.3530277536432625, + "compression/movement_sparsity/model_sparsity": 0.3409001642652162, + "compression_loss": 47.638206481933594, + "distillation_loss": 1.110093593597412, + "epoch": 2.53, + "learning_rate": 4.147647224570302e-05, + "loss": 48.7818, + "step": 2998, + "task_loss": 1.2993837594985962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4444416453524378, + "compression/movement_sparsity/importance_threshold": -0.003890983392759842, + "compression/movement_sparsity/linear_layer_sparsity": 0.35370602414672564, + "compression/movement_sparsity/model_sparsity": 0.34155513409028126, + "compression_loss": 47.69911575317383, + "distillation_loss": 1.4712867736816406, + "epoch": 2.53, + "learning_rate": 4.14717760871607e-05, + "loss": 48.8644, + "step": 2999, + "task_loss": 0.6597622632980347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.445012229033934, + "compression/movement_sparsity/importance_threshold": -0.0038869871759622567, + "compression/movement_sparsity/linear_layer_sparsity": 0.35441882703966227, + "compression/movement_sparsity/model_sparsity": 0.3422434500110073, + "compression_loss": 47.760009765625, + "distillation_loss": 1.3652691841125488, + "epoch": 2.54, + "learning_rate": 4.146707992861839e-05, + "loss": 48.9519, + "step": 3000, + "task_loss": 1.496319055557251 + }, + { + "epoch": 2.54, + "eval_accuracy": 0.8596831683168317, + "eval_loss": 48.44996643066406, + "eval_runtime": 228.2323, + "eval_samples_per_second": 110.633, + "eval_steps_per_second": 0.868, + "step": 3000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.44558242190470465, + "compression/movement_sparsity/importance_threshold": -0.0038829936962993575, + "compression/movement_sparsity/linear_layer_sparsity": 0.3551592940015141, + "compression/movement_sparsity/model_sparsity": 0.3429584796547767, + "compression_loss": 47.820838928222656, + "distillation_loss": 1.1190260648727417, + "epoch": 2.54, + "learning_rate": 4.146238377007608e-05, + "loss": 48.8438, + "step": 3001, + "task_loss": 0.5882004499435425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.446152224098635, + "compression/movement_sparsity/importance_threshold": -0.0038790029528334483, + "compression/movement_sparsity/linear_layer_sparsity": 0.35590436369207334, + "compression/movement_sparsity/model_sparsity": 0.3436779539093628, + "compression_loss": 47.881649017333984, + "distillation_loss": 1.0594462156295776, + "epoch": 2.54, + "learning_rate": 4.145768761153377e-05, + "loss": 49.1306, + "step": 3002, + "task_loss": 1.8961745500564575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4467216357496101, + "compression/movement_sparsity/importance_threshold": -0.0038750149446268333, + "compression/movement_sparsity/linear_layer_sparsity": 0.35665101929692816, + "compression/movement_sparsity/model_sparsity": 0.34439895959720956, + "compression_loss": 47.94240951538086, + "distillation_loss": 0.5419052839279175, + "epoch": 2.54, + "learning_rate": 4.145299145299146e-05, + "loss": 49.1317, + "step": 3003, + "task_loss": 1.178877592086792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.44729065699151516, + "compression/movement_sparsity/importance_threshold": -0.0038710296707418156, + "compression/movement_sparsity/linear_layer_sparsity": 0.35721275491008603, + "compression/movement_sparsity/model_sparsity": 0.34494139786395456, + "compression_loss": 48.00312805175781, + "distillation_loss": 1.1679743528366089, + "epoch": 2.54, + "learning_rate": 4.144829529444914e-05, + "loss": 49.3554, + "step": 3004, + "task_loss": 1.26522696018219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4478592879582354, + "compression/movement_sparsity/importance_threshold": -0.0038670471302406995, + "compression/movement_sparsity/linear_layer_sparsity": 0.3578508648169516, + "compression/movement_sparsity/model_sparsity": 0.3455575867324636, + "compression_loss": 48.063846588134766, + "distillation_loss": 0.7495870590209961, + "epoch": 2.54, + "learning_rate": 4.144359913590683e-05, + "loss": 48.8428, + "step": 3005, + "task_loss": 1.313217282295227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4484275287836559, + "compression/movement_sparsity/importance_threshold": -0.003863067322185789, + "compression/movement_sparsity/linear_layer_sparsity": 0.35861726684341144, + "compression/movement_sparsity/model_sparsity": 0.34629766049158617, + "compression_loss": 48.124446868896484, + "distillation_loss": 1.5468318462371826, + "epoch": 2.54, + "learning_rate": 4.143890297736452e-05, + "loss": 49.4221, + "step": 3006, + "task_loss": 1.7378158569335938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4489953796016617, + "compression/movement_sparsity/importance_threshold": -0.0038590902456393885, + "compression/movement_sparsity/linear_layer_sparsity": 0.35940969932782035, + "compression/movement_sparsity/model_sparsity": 0.3470628704823482, + "compression_loss": 48.185035705566406, + "distillation_loss": 0.7872080206871033, + "epoch": 2.54, + "learning_rate": 4.143420681882221e-05, + "loss": 48.995, + "step": 3007, + "task_loss": 0.330612450838089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4495628405461377, + "compression/movement_sparsity/importance_threshold": -0.0038551158996638036, + "compression/movement_sparsity/linear_layer_sparsity": 0.3601791658653626, + "compression/movement_sparsity/model_sparsity": 0.3478059034771699, + "compression_loss": 48.24555587768555, + "distillation_loss": 1.0740365982055664, + "epoch": 2.54, + "learning_rate": 4.142951066027989e-05, + "loss": 49.3152, + "step": 3008, + "task_loss": 1.4286096096038818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4501299117509697, + "compression/movement_sparsity/importance_threshold": -0.003851144283321334, + "compression/movement_sparsity/linear_layer_sparsity": 0.36090744632789057, + "compression/movement_sparsity/model_sparsity": 0.34850916526535725, + "compression_loss": 48.30609893798828, + "distillation_loss": 0.64019775390625, + "epoch": 2.54, + "learning_rate": 4.142481450173758e-05, + "loss": 49.0841, + "step": 3009, + "task_loss": 0.7700440883636475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45069659335004253, + "compression/movement_sparsity/importance_threshold": -0.0038471753956742867, + "compression/movement_sparsity/linear_layer_sparsity": 0.36162944275407444, + "compression/movement_sparsity/model_sparsity": 0.3492063588931809, + "compression_loss": 48.36655807495117, + "distillation_loss": 0.82325279712677, + "epoch": 2.54, + "learning_rate": 4.142011834319527e-05, + "loss": 49.1442, + "step": 3010, + "task_loss": 0.943035364151001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45126288547724125, + "compression/movement_sparsity/importance_threshold": -0.0038432092357849647, + "compression/movement_sparsity/linear_layer_sparsity": 0.3622767938908578, + "compression/movement_sparsity/model_sparsity": 0.3498314715269307, + "compression_loss": 48.426971435546875, + "distillation_loss": 0.7042733430862427, + "epoch": 2.54, + "learning_rate": 4.1415422184652956e-05, + "loss": 49.4364, + "step": 3011, + "task_loss": 0.8579145073890686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4518287882664511, + "compression/movement_sparsity/importance_threshold": -0.0038392458027156715, + "compression/movement_sparsity/linear_layer_sparsity": 0.36308425082648793, + "compression/movement_sparsity/model_sparsity": 0.3506111898327939, + "compression_loss": 48.48737335205078, + "distillation_loss": 0.6257011890411377, + "epoch": 2.55, + "learning_rate": 4.141072602611064e-05, + "loss": 49.236, + "step": 3012, + "task_loss": 0.36222517490386963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45239430185155705, + "compression/movement_sparsity/importance_threshold": -0.0038352850955287138, + "compression/movement_sparsity/linear_layer_sparsity": 0.363910440629474, + "compression/movement_sparsity/model_sparsity": 0.3514089974743902, + "compression_loss": 48.54767990112305, + "distillation_loss": 1.6978936195373535, + "epoch": 2.55, + "learning_rate": 4.140602986756833e-05, + "loss": 49.9977, + "step": 3013, + "task_loss": 0.5435124635696411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4529594263664445, + "compression/movement_sparsity/importance_threshold": -0.003831327113286392, + "compression/movement_sparsity/linear_layer_sparsity": 0.3645647316318214, + "compression/movement_sparsity/model_sparsity": 0.35204081156797246, + "compression_loss": 48.60795211791992, + "distillation_loss": 1.3760325908660889, + "epoch": 2.55, + "learning_rate": 4.140133370902602e-05, + "loss": 49.7061, + "step": 3014, + "task_loss": 0.5093219876289368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45352416194499834, + "compression/movement_sparsity/importance_threshold": -0.003827371855051012, + "compression/movement_sparsity/linear_layer_sparsity": 0.3653375130879664, + "compression/movement_sparsity/model_sparsity": 0.3527870456037451, + "compression_loss": 48.668235778808594, + "distillation_loss": 0.7905077934265137, + "epoch": 2.55, + "learning_rate": 4.139663755048371e-05, + "loss": 49.5013, + "step": 3015, + "task_loss": 0.4139029383659363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45408850872110396, + "compression/movement_sparsity/importance_threshold": -0.0038234193198848775, + "compression/movement_sparsity/linear_layer_sparsity": 0.36603408718875063, + "compression/movement_sparsity/model_sparsity": 0.35345969024125473, + "compression_loss": 48.72843551635742, + "distillation_loss": 1.47310471534729, + "epoch": 2.55, + "learning_rate": 4.1391941391941394e-05, + "loss": 50.0369, + "step": 3016, + "task_loss": 1.5562087297439575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45465246682864624, + "compression/movement_sparsity/importance_threshold": -0.003819469506850293, + "compression/movement_sparsity/linear_layer_sparsity": 0.36682802211828164, + "compression/movement_sparsity/model_sparsity": 0.35422635106352685, + "compression_loss": 48.78861618041992, + "distillation_loss": 0.8547554016113281, + "epoch": 2.55, + "learning_rate": 4.138724523339908e-05, + "loss": 49.8, + "step": 3017, + "task_loss": 1.096545696258545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4552160364015102, + "compression/movement_sparsity/importance_threshold": -0.0038155224150095636, + "compression/movement_sparsity/linear_layer_sparsity": 0.36765699025232684, + "compression/movement_sparsity/model_sparsity": 0.35502684159196335, + "compression_loss": 48.848758697509766, + "distillation_loss": 0.8751019239425659, + "epoch": 2.55, + "learning_rate": 4.138254907485677e-05, + "loss": 49.6146, + "step": 3018, + "task_loss": 0.6735438704490662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45577921757358153, + "compression/movement_sparsity/importance_threshold": -0.0038115780434249897, + "compression/movement_sparsity/linear_layer_sparsity": 0.3684541685554548, + "compression/movement_sparsity/model_sparsity": 0.35579663436797154, + "compression_loss": 48.908836364746094, + "distillation_loss": 1.1700892448425293, + "epoch": 2.55, + "learning_rate": 4.137785291631446e-05, + "loss": 49.8204, + "step": 3019, + "task_loss": 1.5246810913085938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.456342010478745, + "compression/movement_sparsity/importance_threshold": -0.003807636391158877, + "compression/movement_sparsity/linear_layer_sparsity": 0.3693198273533156, + "compression/movement_sparsity/model_sparsity": 0.35663255512304787, + "compression_loss": 48.96885681152344, + "distillation_loss": 0.6335422992706299, + "epoch": 2.55, + "learning_rate": 4.1373156757772146e-05, + "loss": 49.8475, + "step": 3020, + "task_loss": 0.4357015788555145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4569044152508859, + "compression/movement_sparsity/importance_threshold": -0.0038036974572735302, + "compression/movement_sparsity/linear_layer_sparsity": 0.37017668611546106, + "compression/movement_sparsity/model_sparsity": 0.35745997815070774, + "compression_loss": 49.02882766723633, + "distillation_loss": 0.8161166906356812, + "epoch": 2.55, + "learning_rate": 4.136846059922983e-05, + "loss": 50.0513, + "step": 3021, + "task_loss": 0.6460744142532349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45746643202388926, + "compression/movement_sparsity/importance_threshold": -0.003799761240831252, + "compression/movement_sparsity/linear_layer_sparsity": 0.3710223003875259, + "compression/movement_sparsity/model_sparsity": 0.3582765429711134, + "compression_loss": 49.08876037597656, + "distillation_loss": 0.9189770221710205, + "epoch": 2.55, + "learning_rate": 4.136376444068752e-05, + "loss": 50.2145, + "step": 3022, + "task_loss": 1.298644781112671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45802806093164017, + "compression/movement_sparsity/importance_threshold": -0.0037958277408943484, + "compression/movement_sparsity/linear_layer_sparsity": 0.3716790954650769, + "compression/movement_sparsity/model_sparsity": 0.35891077511721253, + "compression_loss": 49.14867401123047, + "distillation_loss": 0.6936419010162354, + "epoch": 2.56, + "learning_rate": 4.1359068282145205e-05, + "loss": 49.9848, + "step": 3023, + "task_loss": 0.2733449935913086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.45858930210802396, + "compression/movement_sparsity/importance_threshold": -0.0037918969565251207, + "compression/movement_sparsity/linear_layer_sparsity": 0.37237336820150735, + "compression/movement_sparsity/model_sparsity": 0.3595811974493138, + "compression_loss": 49.208499908447266, + "distillation_loss": 1.0767335891723633, + "epoch": 2.56, + "learning_rate": 4.13543721236029e-05, + "loss": 50.1063, + "step": 3024, + "task_loss": 0.5466818809509277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4591501556869255, + "compression/movement_sparsity/importance_threshold": -0.0037879688867858757, + "compression/movement_sparsity/linear_layer_sparsity": 0.3731552954942291, + "compression/movement_sparsity/model_sparsity": 0.3603362631340409, + "compression_loss": 49.26829147338867, + "distillation_loss": 0.908270001411438, + "epoch": 2.56, + "learning_rate": 4.134967596506058e-05, + "loss": 50.0475, + "step": 3025, + "task_loss": 0.8433714509010315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4597106218022302, + "compression/movement_sparsity/importance_threshold": -0.0037840435307389154, + "compression/movement_sparsity/linear_layer_sparsity": 0.37391599776855894, + "compression/movement_sparsity/model_sparsity": 0.3610708329450536, + "compression_loss": 49.32806396484375, + "distillation_loss": 0.8883118629455566, + "epoch": 2.56, + "learning_rate": 4.134497980651827e-05, + "loss": 50.2666, + "step": 3026, + "task_loss": 1.3919849395751953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46027070058782316, + "compression/movement_sparsity/importance_threshold": -0.003780120887446544, + "compression/movement_sparsity/linear_layer_sparsity": 0.37479120782469605, + "compression/movement_sparsity/model_sparsity": 0.3619159768433013, + "compression_loss": 49.387760162353516, + "distillation_loss": 0.9273070096969604, + "epoch": 2.56, + "learning_rate": 4.134028364797596e-05, + "loss": 50.3281, + "step": 3027, + "task_loss": 0.4517025649547577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46083039217758914, + "compression/movement_sparsity/importance_threshold": -0.0037762009559710675, + "compression/movement_sparsity/linear_layer_sparsity": 0.3756021466172758, + "compression/movement_sparsity/model_sparsity": 0.3626990573936164, + "compression_loss": 49.447383880615234, + "distillation_loss": 0.6709448099136353, + "epoch": 2.56, + "learning_rate": 4.1335587489433644e-05, + "loss": 50.7968, + "step": 3028, + "task_loss": 0.4689825177192688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46138969670541374, + "compression/movement_sparsity/importance_threshold": -0.0037722837353747875, + "compression/movement_sparsity/linear_layer_sparsity": 0.3764737436506192, + "compression/movement_sparsity/model_sparsity": 0.3635407123875184, + "compression_loss": 49.5069694519043, + "distillation_loss": 0.9528563022613525, + "epoch": 2.56, + "learning_rate": 4.133089133089133e-05, + "loss": 50.6858, + "step": 3029, + "task_loss": 0.37805211544036865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4619486143051821, + "compression/movement_sparsity/importance_threshold": -0.0037683692247200075, + "compression/movement_sparsity/linear_layer_sparsity": 0.3771501539839312, + "compression/movement_sparsity/model_sparsity": 0.3641938859449995, + "compression_loss": 49.5665397644043, + "distillation_loss": 1.7192420959472656, + "epoch": 2.56, + "learning_rate": 4.1326195172349016e-05, + "loss": 50.7692, + "step": 3030, + "task_loss": 0.5860130190849304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46250714511077906, + "compression/movement_sparsity/importance_threshold": -0.003764457423069034, + "compression/movement_sparsity/linear_layer_sparsity": 0.37780412303375244, + "compression/movement_sparsity/model_sparsity": 0.3648253891461153, + "compression_loss": 49.62607955932617, + "distillation_loss": 1.0017911195755005, + "epoch": 2.56, + "learning_rate": 4.132149901380671e-05, + "loss": 50.4827, + "step": 3031, + "task_loss": 1.1313321590423584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46306528925609003, + "compression/movement_sparsity/importance_threshold": -0.0037605483294841685, + "compression/movement_sparsity/linear_layer_sparsity": 0.37836743263703826, + "compression/movement_sparsity/model_sparsity": 0.3653693473315852, + "compression_loss": 49.685585021972656, + "distillation_loss": 1.3471577167510986, + "epoch": 2.56, + "learning_rate": 4.1316802855264396e-05, + "loss": 50.8951, + "step": 3032, + "task_loss": 1.1773598194122314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.463623046875, + "compression/movement_sparsity/importance_threshold": -0.0037566419430277165, + "compression/movement_sparsity/linear_layer_sparsity": 0.3791244861160716, + "compression/movement_sparsity/model_sparsity": 0.3661003936946448, + "compression_loss": 49.74504470825195, + "distillation_loss": 0.7267760038375854, + "epoch": 2.56, + "learning_rate": 4.131210669672209e-05, + "loss": 50.8365, + "step": 3033, + "task_loss": 1.2858234643936157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4641804181013941, + "compression/movement_sparsity/importance_threshold": -0.003752738262761982, + "compression/movement_sparsity/linear_layer_sparsity": 0.3800208854180976, + "compression/movement_sparsity/model_sparsity": 0.3669659989229994, + "compression_loss": 49.804466247558594, + "distillation_loss": 1.079267978668213, + "epoch": 2.56, + "learning_rate": 4.130741053817977e-05, + "loss": 50.9071, + "step": 3034, + "task_loss": 1.036117434501648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46473740306915756, + "compression/movement_sparsity/importance_threshold": -0.003748837287749268, + "compression/movement_sparsity/linear_layer_sparsity": 0.3807544721352235, + "compression/movement_sparsity/model_sparsity": 0.36767438467961533, + "compression_loss": 49.863868713378906, + "distillation_loss": 0.864710807800293, + "epoch": 2.57, + "learning_rate": 4.130271437963746e-05, + "loss": 50.7787, + "step": 3035, + "task_loss": 0.7965424060821533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4652940019121755, + "compression/movement_sparsity/importance_threshold": -0.003744939017051879, + "compression/movement_sparsity/linear_layer_sparsity": 0.38147685013477173, + "compression/movement_sparsity/model_sparsity": 0.36837194677258445, + "compression_loss": 49.92317199707031, + "distillation_loss": 1.1272435188293457, + "epoch": 2.57, + "learning_rate": 4.129801822109515e-05, + "loss": 50.9183, + "step": 3036, + "task_loss": 1.9793462753295898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46585021476433297, + "compression/movement_sparsity/importance_threshold": -0.00374104344973212, + "compression/movement_sparsity/linear_layer_sparsity": 0.38217776463257536, + "compression/movement_sparsity/model_sparsity": 0.36904878270112335, + "compression_loss": 49.982486724853516, + "distillation_loss": 0.7878408432006836, + "epoch": 2.57, + "learning_rate": 4.1293322062552834e-05, + "loss": 51.0231, + "step": 3037, + "task_loss": 1.057770013809204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4664060417595153, + "compression/movement_sparsity/importance_threshold": -0.003737150584852293, + "compression/movement_sparsity/linear_layer_sparsity": 0.3828524340374125, + "compression/movement_sparsity/model_sparsity": 0.36970027513637843, + "compression_loss": 50.04176712036133, + "distillation_loss": 0.6824671030044556, + "epoch": 2.57, + "learning_rate": 4.128862590401052e-05, + "loss": 50.9101, + "step": 3038, + "task_loss": 0.7235848307609558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4669614830316071, + "compression/movement_sparsity/importance_threshold": -0.0037332604214747054, + "compression/movement_sparsity/linear_layer_sparsity": 0.38356084883665914, + "compression/movement_sparsity/model_sparsity": 0.3703843537079321, + "compression_loss": 50.10099411010742, + "distillation_loss": 2.0175516605377197, + "epoch": 2.57, + "learning_rate": 4.128392974546821e-05, + "loss": 51.5324, + "step": 3039, + "task_loss": 1.3853259086608887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4675165387144943, + "compression/movement_sparsity/importance_threshold": -0.003729372958661657, + "compression/movement_sparsity/linear_layer_sparsity": 0.3843623317643037, + "compression/movement_sparsity/model_sparsity": 0.3711583032313622, + "compression_loss": 50.160194396972656, + "distillation_loss": 1.2447974681854248, + "epoch": 2.57, + "learning_rate": 4.12792335869259e-05, + "loss": 51.314, + "step": 3040, + "task_loss": 1.5853455066680908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.46807120894206156, + "compression/movement_sparsity/importance_threshold": -0.0037254881954754534, + "compression/movement_sparsity/linear_layer_sparsity": 0.38501275933633705, + "compression/movement_sparsity/model_sparsity": 0.371786386615347, + "compression_loss": 50.21932601928711, + "distillation_loss": 0.7990949153900146, + "epoch": 2.57, + "learning_rate": 4.1274537428383586e-05, + "loss": 51.0394, + "step": 3041, + "task_loss": 0.6759595274925232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4686254938481941, + "compression/movement_sparsity/importance_threshold": -0.0037216061309783993, + "compression/movement_sparsity/linear_layer_sparsity": 0.3857994682202808, + "compression/movement_sparsity/model_sparsity": 0.3725460696289276, + "compression_loss": 50.27841567993164, + "distillation_loss": 1.5597389936447144, + "epoch": 2.57, + "learning_rate": 4.126984126984127e-05, + "loss": 51.5301, + "step": 3042, + "task_loss": 0.8850700855255127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4691793935667772, + "compression/movement_sparsity/importance_threshold": -0.003717726764232797, + "compression/movement_sparsity/linear_layer_sparsity": 0.38665283320130894, + "compression/movement_sparsity/model_sparsity": 0.37337011889759975, + "compression_loss": 50.33747100830078, + "distillation_loss": 1.25918447971344, + "epoch": 2.57, + "learning_rate": 4.126514511129896e-05, + "loss": 51.408, + "step": 3043, + "task_loss": 1.4573487043380737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4697329082316958, + "compression/movement_sparsity/importance_threshold": -0.0037138500943009523, + "compression/movement_sparsity/linear_layer_sparsity": 0.38737806107692213, + "compression/movement_sparsity/model_sparsity": 0.3740704329646237, + "compression_loss": 50.39646911621094, + "distillation_loss": 1.6351810693740845, + "epoch": 2.57, + "learning_rate": 4.1260448952756645e-05, + "loss": 51.6377, + "step": 3044, + "task_loss": 1.0610618591308594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4702860379768351, + "compression/movement_sparsity/importance_threshold": -0.0037099761202451687, + "compression/movement_sparsity/linear_layer_sparsity": 0.3881967148059623, + "compression/movement_sparsity/model_sparsity": 0.3748609634195979, + "compression_loss": 50.455448150634766, + "distillation_loss": 1.4475548267364502, + "epoch": 2.57, + "learning_rate": 4.125575279421434e-05, + "loss": 51.6234, + "step": 3045, + "task_loss": 1.4041566848754883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4708387829360803, + "compression/movement_sparsity/importance_threshold": -0.00370610484112775, + "compression/movement_sparsity/linear_layer_sparsity": 0.38897395590080314, + "compression/movement_sparsity/model_sparsity": 0.3756115038917576, + "compression_loss": 50.51437759399414, + "distillation_loss": 0.965828537940979, + "epoch": 2.57, + "learning_rate": 4.125105663567202e-05, + "loss": 51.2874, + "step": 3046, + "task_loss": 0.6727010607719421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.47139114324331644, + "compression/movement_sparsity/importance_threshold": -0.003702236256011, + "compression/movement_sparsity/linear_layer_sparsity": 0.38978890121370857, + "compression/movement_sparsity/model_sparsity": 0.3763984533260997, + "compression_loss": 50.57326889038086, + "distillation_loss": 0.9208170175552368, + "epoch": 2.58, + "learning_rate": 4.124636047712971e-05, + "loss": 51.3711, + "step": 3047, + "task_loss": 0.9423845410346985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4719431190324288, + "compression/movement_sparsity/importance_threshold": -0.0036983703639572223, + "compression/movement_sparsity/linear_layer_sparsity": 0.3904407000650201, + "compression/movement_sparsity/model_sparsity": 0.3770278608817009, + "compression_loss": 50.63212585449219, + "distillation_loss": 1.2266002893447876, + "epoch": 2.58, + "learning_rate": 4.12416643185874e-05, + "loss": 51.55, + "step": 3048, + "task_loss": 1.8993918895721436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4724947104373022, + "compression/movement_sparsity/importance_threshold": -0.0036945071640287234, + "compression/movement_sparsity/linear_layer_sparsity": 0.39135028749645134, + "compression/movement_sparsity/model_sparsity": 0.3779062011866443, + "compression_loss": 50.69087219238281, + "distillation_loss": 1.4201915264129639, + "epoch": 2.58, + "learning_rate": 4.1236968160045084e-05, + "loss": 51.8787, + "step": 3049, + "task_loss": 1.2274408340454102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.47304591759182224, + "compression/movement_sparsity/importance_threshold": -0.0036906466552878037, + "compression/movement_sparsity/linear_layer_sparsity": 0.392159330346377, + "compression/movement_sparsity/model_sparsity": 0.3786874509257681, + "compression_loss": 50.749637603759766, + "distillation_loss": 1.0182191133499146, + "epoch": 2.58, + "learning_rate": 4.123227200150278e-05, + "loss": 52.0471, + "step": 3050, + "task_loss": 1.4530220031738281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4735967406298738, + "compression/movement_sparsity/importance_threshold": -0.0036867888367967693, + "compression/movement_sparsity/linear_layer_sparsity": 0.3931106762128691, + "compression/movement_sparsity/model_sparsity": 0.3796061151350639, + "compression_loss": 50.8083381652832, + "distillation_loss": 0.9240841269493103, + "epoch": 2.58, + "learning_rate": 4.122757584296046e-05, + "loss": 51.8435, + "step": 3051, + "task_loss": 0.48819324374198914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.47414717968534215, + "compression/movement_sparsity/importance_threshold": -0.003682933707617923, + "compression/movement_sparsity/linear_layer_sparsity": 0.3937904491614544, + "compression/movement_sparsity/model_sparsity": 0.38026253579163916, + "compression_loss": 50.86696243286133, + "distillation_loss": 1.5159823894500732, + "epoch": 2.58, + "learning_rate": 4.122287968441815e-05, + "loss": 52.0238, + "step": 3052, + "task_loss": 1.1704357862472534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.47469723489211224, + "compression/movement_sparsity/importance_threshold": -0.0036790812668135693, + "compression/movement_sparsity/linear_layer_sparsity": 0.3945180141739242, + "compression/movement_sparsity/model_sparsity": 0.38096510670767886, + "compression_loss": 50.92556381225586, + "distillation_loss": 1.1936275959014893, + "epoch": 2.58, + "learning_rate": 4.1218183525875836e-05, + "loss": 51.7775, + "step": 3053, + "task_loss": 0.8866280317306519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4752469063840694, + "compression/movement_sparsity/importance_threshold": -0.0036752315134460124, + "compression/movement_sparsity/linear_layer_sparsity": 0.39528418964119894, + "compression/movement_sparsity/model_sparsity": 0.38170496169062124, + "compression_loss": 50.9841194152832, + "distillation_loss": 0.818263828754425, + "epoch": 2.58, + "learning_rate": 4.121348736733352e-05, + "loss": 52.0774, + "step": 3054, + "task_loss": 1.1661381721496582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4757961942950987, + "compression/movement_sparsity/importance_threshold": -0.0036713844465775564, + "compression/movement_sparsity/linear_layer_sparsity": 0.3960466089956684, + "compression/movement_sparsity/model_sparsity": 0.3824411895947884, + "compression_loss": 51.04262161254883, + "distillation_loss": 1.4447181224822998, + "epoch": 2.58, + "learning_rate": 4.120879120879121e-05, + "loss": 52.3121, + "step": 3055, + "task_loss": 1.1622289419174194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.47634509875908515, + "compression/movement_sparsity/importance_threshold": -0.0036675400652705055, + "compression/movement_sparsity/linear_layer_sparsity": 0.39678727866837, + "compression/movement_sparsity/model_sparsity": 0.3831564149856663, + "compression_loss": 51.10111618041992, + "distillation_loss": 0.9131793975830078, + "epoch": 2.58, + "learning_rate": 4.1204095050248895e-05, + "loss": 52.1258, + "step": 3056, + "task_loss": 0.8300709128379822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4768936199099141, + "compression/movement_sparsity/importance_threshold": -0.003663698368587163, + "compression/movement_sparsity/linear_layer_sparsity": 0.3974621984807275, + "compression/movement_sparsity/model_sparsity": 0.3838081492261731, + "compression_loss": 51.15959167480469, + "distillation_loss": 0.8873772621154785, + "epoch": 2.58, + "learning_rate": 4.119939889170659e-05, + "loss": 52.231, + "step": 3057, + "task_loss": 0.9160637259483337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4774417578814706, + "compression/movement_sparsity/importance_threshold": -0.0036598593555898324, + "compression/movement_sparsity/linear_layer_sparsity": 0.3983995096019121, + "compression/movement_sparsity/model_sparsity": 0.38471326082683877, + "compression_loss": 51.21798324584961, + "distillation_loss": 1.5619075298309326, + "epoch": 2.58, + "learning_rate": 4.1194702733164275e-05, + "loss": 52.5645, + "step": 3058, + "task_loss": 1.3484995365142822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.47798951280763957, + "compression/movement_sparsity/importance_threshold": -0.00365602302534082, + "compression/movement_sparsity/linear_layer_sparsity": 0.3990665475394623, + "compression/movement_sparsity/model_sparsity": 0.3853573839591854, + "compression_loss": 51.2763557434082, + "distillation_loss": 0.710129976272583, + "epoch": 2.59, + "learning_rate": 4.119000657462196e-05, + "loss": 52.3275, + "step": 3059, + "task_loss": 0.647619903087616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4785368848223065, + "compression/movement_sparsity/importance_threshold": -0.003652189376902428, + "compression/movement_sparsity/linear_layer_sparsity": 0.4000348495723326, + "compression/movement_sparsity/model_sparsity": 0.3862924218383811, + "compression_loss": 51.33466339111328, + "distillation_loss": 0.5760276317596436, + "epoch": 2.59, + "learning_rate": 4.118531041607965e-05, + "loss": 52.5166, + "step": 3060, + "task_loss": 1.2770187854766846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4790838740593565, + "compression/movement_sparsity/importance_threshold": -0.0036483584093369593, + "compression/movement_sparsity/linear_layer_sparsity": 0.4006868511344939, + "compression/movement_sparsity/model_sparsity": 0.38692202514109075, + "compression_loss": 51.39292907714844, + "distillation_loss": 0.5586639046669006, + "epoch": 2.59, + "learning_rate": 4.1180614257537334e-05, + "loss": 52.1561, + "step": 3061, + "task_loss": 0.3985616862773895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.47963048065267455, + "compression/movement_sparsity/importance_threshold": -0.0036445301217067193, + "compression/movement_sparsity/linear_layer_sparsity": 0.40141410611860523, + "compression/movement_sparsity/model_sparsity": 0.38762429667919984, + "compression_loss": 51.4511833190918, + "distillation_loss": 1.1474469900131226, + "epoch": 2.59, + "learning_rate": 4.117591809899503e-05, + "loss": 52.6888, + "step": 3062, + "task_loss": 0.9431793689727783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.48017670473614593, + "compression/movement_sparsity/importance_threshold": -0.0036407045130740115, + "compression/movement_sparsity/linear_layer_sparsity": 0.40221552942541156, + "compression/movement_sparsity/model_sparsity": 0.3883981886299509, + "compression_loss": 51.5093879699707, + "distillation_loss": 1.1265697479248047, + "epoch": 2.59, + "learning_rate": 4.117122194045271e-05, + "loss": 53.1513, + "step": 3063, + "task_loss": 2.025576114654541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.48072254644365564, + "compression/movement_sparsity/importance_threshold": -0.003636881582501141, + "compression/movement_sparsity/linear_layer_sparsity": 0.4029224537037037, + "compression/movement_sparsity/model_sparsity": 0.38908082788453024, + "compression_loss": 51.56752395629883, + "distillation_loss": 1.0329691171646118, + "epoch": 2.59, + "learning_rate": 4.11665257819104e-05, + "loss": 52.7553, + "step": 3064, + "task_loss": 1.479643702507019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4812680059090889, + "compression/movement_sparsity/importance_threshold": -0.0036330613290504105, + "compression/movement_sparsity/linear_layer_sparsity": 0.4036648643048802, + "compression/movement_sparsity/model_sparsity": 0.38979773439763415, + "compression_loss": 51.62565231323242, + "distillation_loss": 2.521170139312744, + "epoch": 2.59, + "learning_rate": 4.1161829623368086e-05, + "loss": 53.0655, + "step": 3065, + "task_loss": 2.259763479232788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4818130832663309, + "compression/movement_sparsity/importance_threshold": -0.003629243751784124, + "compression/movement_sparsity/linear_layer_sparsity": 0.40445003497119886, + "compression/movement_sparsity/model_sparsity": 0.3905559320360973, + "compression_loss": 51.683719635009766, + "distillation_loss": 0.7968692183494568, + "epoch": 2.59, + "learning_rate": 4.115713346482578e-05, + "loss": 52.5418, + "step": 3066, + "task_loss": 0.3419604003429413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.48235777864926654, + "compression/movement_sparsity/importance_threshold": -0.0036254288497645872, + "compression/movement_sparsity/linear_layer_sparsity": 0.40522869504198833, + "compression/movement_sparsity/model_sparsity": 0.3913078427380166, + "compression_loss": 51.741756439208984, + "distillation_loss": 0.9649976491928101, + "epoch": 2.59, + "learning_rate": 4.1152437306283465e-05, + "loss": 53.0487, + "step": 3067, + "task_loss": 2.7186105251312256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.48290209219178126, + "compression/movement_sparsity/importance_threshold": -0.003621616622054102, + "compression/movement_sparsity/linear_layer_sparsity": 0.4060898107319798, + "compression/movement_sparsity/model_sparsity": 0.39213937645495517, + "compression_loss": 51.79972457885742, + "distillation_loss": 1.25900137424469, + "epoch": 2.59, + "learning_rate": 4.1147741147741145e-05, + "loss": 53.079, + "step": 3068, + "task_loss": 1.3018369674682617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4834460240277598, + "compression/movement_sparsity/importance_threshold": -0.003617807067714975, + "compression/movement_sparsity/linear_layer_sparsity": 0.4067707403248258, + "compression/movement_sparsity/model_sparsity": 0.3927969140215024, + "compression_loss": 51.85771942138672, + "distillation_loss": 1.1364296674728394, + "epoch": 2.59, + "learning_rate": 4.114304498919884e-05, + "loss": 52.9511, + "step": 3069, + "task_loss": 0.5344680547714233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4839895742910879, + "compression/movement_sparsity/importance_threshold": -0.003614000185809506, + "compression/movement_sparsity/linear_layer_sparsity": 0.4074406519867762, + "compression/movement_sparsity/model_sparsity": 0.3934438121569755, + "compression_loss": 51.91560363769531, + "distillation_loss": 1.3081576824188232, + "epoch": 2.59, + "learning_rate": 4.1138348830656524e-05, + "loss": 52.9715, + "step": 3070, + "task_loss": 1.148207426071167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4845327431156503, + "compression/movement_sparsity/importance_threshold": -0.0036101959754000017, + "compression/movement_sparsity/linear_layer_sparsity": 0.40820102038441225, + "compression/movement_sparsity/model_sparsity": 0.394178059560986, + "compression_loss": 51.97350311279297, + "distillation_loss": 0.9547584056854248, + "epoch": 2.6, + "learning_rate": 4.113365267211422e-05, + "loss": 53.2431, + "step": 3071, + "task_loss": 1.34943425655365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.48507553063533226, + "compression/movement_sparsity/importance_threshold": -0.003606394435548766, + "compression/movement_sparsity/linear_layer_sparsity": 0.40889115543467314, + "compression/movement_sparsity/model_sparsity": 0.39484448634916663, + "compression_loss": 52.03136444091797, + "distillation_loss": 1.7997688055038452, + "epoch": 2.6, + "learning_rate": 4.11289565135719e-05, + "loss": 53.5553, + "step": 3072, + "task_loss": 1.2421348094940186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4856179369840188, + "compression/movement_sparsity/importance_threshold": -0.0036025955653181028, + "compression/movement_sparsity/linear_layer_sparsity": 0.4097182514743995, + "compression/movement_sparsity/model_sparsity": 0.39564316909548336, + "compression_loss": 52.089202880859375, + "distillation_loss": 1.3371782302856445, + "epoch": 2.6, + "learning_rate": 4.112426035502959e-05, + "loss": 53.4844, + "step": 3073, + "task_loss": 0.8282544612884521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4861599622955952, + "compression/movement_sparsity/importance_threshold": -0.0035987993637703153, + "compression/movement_sparsity/linear_layer_sparsity": 0.4104068244587, + "compression/movement_sparsity/model_sparsity": 0.39630808747947494, + "compression_loss": 52.1469612121582, + "distillation_loss": 1.2503807544708252, + "epoch": 2.6, + "learning_rate": 4.1119564196487276e-05, + "loss": 53.2571, + "step": 3074, + "task_loss": 1.4017322063446045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.48670160670394647, + "compression/movement_sparsity/importance_threshold": -0.003595005829967709, + "compression/movement_sparsity/linear_layer_sparsity": 0.41106533661639055, + "compression/movement_sparsity/model_sparsity": 0.3969439777187284, + "compression_loss": 52.2047119140625, + "distillation_loss": 1.8732569217681885, + "epoch": 2.6, + "learning_rate": 4.111486803794496e-05, + "loss": 53.2659, + "step": 3075, + "task_loss": 1.2021640539169312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4872428703429579, + "compression/movement_sparsity/importance_threshold": -0.0035912149629725863, + "compression/movement_sparsity/linear_layer_sparsity": 0.4119359200954849, + "compression/movement_sparsity/model_sparsity": 0.39778465397708784, + "compression_loss": 52.26237106323242, + "distillation_loss": 1.1679730415344238, + "epoch": 2.6, + "learning_rate": 4.111017187940265e-05, + "loss": 53.5597, + "step": 3076, + "task_loss": 1.2722549438476562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.48778375334651447, + "compression/movement_sparsity/importance_threshold": -0.003587426761847252, + "compression/movement_sparsity/linear_layer_sparsity": 0.41250750507111, + "compression/movement_sparsity/model_sparsity": 0.39833660325039916, + "compression_loss": 52.32002258300781, + "distillation_loss": 0.9417812824249268, + "epoch": 2.6, + "learning_rate": 4.1105475720860335e-05, + "loss": 53.3077, + "step": 3077, + "task_loss": 1.0897536277770996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4883242558485015, + "compression/movement_sparsity/importance_threshold": -0.003583641225654009, + "compression/movement_sparsity/linear_layer_sparsity": 0.4132177681163402, + "compression/movement_sparsity/model_sparsity": 0.39902246657500096, + "compression_loss": 52.37763214111328, + "distillation_loss": 1.0463371276855469, + "epoch": 2.6, + "learning_rate": 4.110077956231803e-05, + "loss": 53.3656, + "step": 3078, + "task_loss": 1.164680004119873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4888643779828038, + "compression/movement_sparsity/importance_threshold": -0.0035798583534551644, + "compression/movement_sparsity/linear_layer_sparsity": 0.41401896486396145, + "compression/movement_sparsity/model_sparsity": 0.39979613974957195, + "compression_loss": 52.4351692199707, + "distillation_loss": 0.8574877381324768, + "epoch": 2.6, + "learning_rate": 4.1096083403775715e-05, + "loss": 53.4403, + "step": 3079, + "task_loss": 0.6933873891830444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4894041198833069, + "compression/movement_sparsity/importance_threshold": -0.0035760781443130185, + "compression/movement_sparsity/linear_layer_sparsity": 0.4147886937331917, + "compression/movement_sparsity/model_sparsity": 0.4005394260641812, + "compression_loss": 52.4926872253418, + "distillation_loss": 2.0362396240234375, + "epoch": 2.6, + "learning_rate": 4.10913872452334e-05, + "loss": 53.9478, + "step": 3080, + "task_loss": 1.576593279838562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4899434816838958, + "compression/movement_sparsity/importance_threshold": -0.003572300597289877, + "compression/movement_sparsity/linear_layer_sparsity": 0.41552476067718586, + "compression/movement_sparsity/model_sparsity": 0.4012502068442424, + "compression_loss": 52.5501594543457, + "distillation_loss": 1.1411570310592651, + "epoch": 2.6, + "learning_rate": 4.108669108669109e-05, + "loss": 53.6176, + "step": 3081, + "task_loss": 1.3787283897399902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4904824635184556, + "compression/movement_sparsity/importance_threshold": -0.003568525711448043, + "compression/movement_sparsity/linear_layer_sparsity": 0.4162437760614608, + "compression/movement_sparsity/model_sparsity": 0.4019445218381174, + "compression_loss": 52.60761642456055, + "distillation_loss": 2.795780658721924, + "epoch": 2.6, + "learning_rate": 4.1081994928148774e-05, + "loss": 54.0081, + "step": 3082, + "task_loss": 1.7832911014556885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.49102106552087144, + "compression/movement_sparsity/importance_threshold": -0.003564753485849822, + "compression/movement_sparsity/linear_layer_sparsity": 0.41701823882524247, + "compression/movement_sparsity/model_sparsity": 0.40269237942343705, + "compression_loss": 52.66501998901367, + "distillation_loss": 1.7607216835021973, + "epoch": 2.61, + "learning_rate": 4.107729876960647e-05, + "loss": 54.1451, + "step": 3083, + "task_loss": 0.8505529761314392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.49155928782502856, + "compression/movement_sparsity/importance_threshold": -0.0035609839195575163, + "compression/movement_sparsity/linear_layer_sparsity": 0.41772236092414017, + "compression/movement_sparsity/model_sparsity": 0.40337231276210467, + "compression_loss": 52.72239685058594, + "distillation_loss": 1.2642372846603394, + "epoch": 2.61, + "learning_rate": 4.107260261106415e-05, + "loss": 54.0129, + "step": 3084, + "task_loss": 0.7006797194480896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.49209713056481197, + "compression/movement_sparsity/importance_threshold": -0.003557217011633431, + "compression/movement_sparsity/linear_layer_sparsity": 0.41842098598175775, + "compression/movement_sparsity/model_sparsity": 0.40404693789977103, + "compression_loss": 52.779762268066406, + "distillation_loss": 1.3705413341522217, + "epoch": 2.61, + "learning_rate": 4.106790645252184e-05, + "loss": 53.8318, + "step": 3085, + "task_loss": 2.092862129211426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4926345938741069, + "compression/movement_sparsity/importance_threshold": -0.0035534527611398694, + "compression/movement_sparsity/linear_layer_sparsity": 0.41915570549480907, + "compression/movement_sparsity/model_sparsity": 0.4047564175372874, + "compression_loss": 52.83705520629883, + "distillation_loss": 1.0804543495178223, + "epoch": 2.61, + "learning_rate": 4.1063210293979526e-05, + "loss": 54.0107, + "step": 3086, + "task_loss": 0.5854677557945251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4931716778867985, + "compression/movement_sparsity/importance_threshold": -0.0035496911671391356, + "compression/movement_sparsity/linear_layer_sparsity": 0.4198811480054397, + "compression/movement_sparsity/model_sparsity": 0.4054569388659557, + "compression_loss": 52.894283294677734, + "distillation_loss": 0.3311106562614441, + "epoch": 2.61, + "learning_rate": 4.105851413543721e-05, + "loss": 53.9928, + "step": 3087, + "task_loss": 0.21367910504341125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.49370838273677176, + "compression/movement_sparsity/importance_threshold": -0.0035459322286935345, + "compression/movement_sparsity/linear_layer_sparsity": 0.4205838630525564, + "compression/movement_sparsity/model_sparsity": 0.4061355134893996, + "compression_loss": 52.951480865478516, + "distillation_loss": 0.7947851419448853, + "epoch": 2.61, + "learning_rate": 4.1053817976894905e-05, + "loss": 54.0601, + "step": 3088, + "task_loss": 0.9420960545539856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4942447085579118, + "compression/movement_sparsity/importance_threshold": -0.0035421759448653702, + "compression/movement_sparsity/linear_layer_sparsity": 0.4212521411035407, + "compression/movement_sparsity/model_sparsity": 0.4067808341334688, + "compression_loss": 53.00864028930664, + "distillation_loss": 1.6943480968475342, + "epoch": 2.61, + "learning_rate": 4.1049121818352585e-05, + "loss": 54.5701, + "step": 3089, + "task_loss": 1.3856816291809082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4947806554841041, + "compression/movement_sparsity/importance_threshold": -0.003538422314716944, + "compression/movement_sparsity/linear_layer_sparsity": 0.42210064102417344, + "compression/movement_sparsity/model_sparsity": 0.40760018547153676, + "compression_loss": 53.06572341918945, + "distillation_loss": 1.337015151977539, + "epoch": 2.61, + "learning_rate": 4.104442565981028e-05, + "loss": 54.4026, + "step": 3090, + "task_loss": 1.0604528188705444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.49531622364923356, + "compression/movement_sparsity/importance_threshold": -0.003534671337310563, + "compression/movement_sparsity/linear_layer_sparsity": 0.42272626632752425, + "compression/movement_sparsity/model_sparsity": 0.40820431862106893, + "compression_loss": 53.122798919677734, + "distillation_loss": 1.3674101829528809, + "epoch": 2.61, + "learning_rate": 4.1039729501267964e-05, + "loss": 54.068, + "step": 3091, + "task_loss": 0.5697746872901917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4958514131871853, + "compression/movement_sparsity/importance_threshold": -0.003530923011708529, + "compression/movement_sparsity/linear_layer_sparsity": 0.4234179038229072, + "compression/movement_sparsity/model_sparsity": 0.4088721962407597, + "compression_loss": 53.179805755615234, + "distillation_loss": 0.7628318667411804, + "epoch": 2.61, + "learning_rate": 4.103503334272565e-05, + "loss": 54.146, + "step": 3092, + "task_loss": 0.4658435881137848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.49638622423184464, + "compression/movement_sparsity/importance_threshold": -0.0035271773369731467, + "compression/movement_sparsity/linear_layer_sparsity": 0.4240316049586222, + "compression/movement_sparsity/model_sparsity": 0.4094648148544973, + "compression_loss": 53.23674011230469, + "distillation_loss": 1.1078250408172607, + "epoch": 2.61, + "learning_rate": 4.103033718418334e-05, + "loss": 54.5533, + "step": 3093, + "task_loss": 0.37736544013023376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4969206569170964, + "compression/movement_sparsity/importance_threshold": -0.0035234343121667215, + "compression/movement_sparsity/linear_layer_sparsity": 0.42496176157922533, + "compression/movement_sparsity/model_sparsity": 0.4103630177336863, + "compression_loss": 53.293678283691406, + "distillation_loss": 1.5750502347946167, + "epoch": 2.61, + "learning_rate": 4.1025641025641023e-05, + "loss": 54.2085, + "step": 3094, + "task_loss": 1.7453081607818604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.49745471137682606, + "compression/movement_sparsity/importance_threshold": -0.003519693936351555, + "compression/movement_sparsity/linear_layer_sparsity": 0.42568883770082205, + "compression/movement_sparsity/model_sparsity": 0.4110651165537584, + "compression_loss": 53.350547790527344, + "distillation_loss": 1.0486245155334473, + "epoch": 2.62, + "learning_rate": 4.1020944867098717e-05, + "loss": 54.7001, + "step": 3095, + "task_loss": 1.222252607345581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4979883877449186, + "compression/movement_sparsity/importance_threshold": -0.0035159562085899524, + "compression/movement_sparsity/linear_layer_sparsity": 0.4262994147046164, + "compression/movement_sparsity/model_sparsity": 0.4116547183591179, + "compression_loss": 53.4073600769043, + "distillation_loss": 1.1021398305892944, + "epoch": 2.62, + "learning_rate": 4.10162487085564e-05, + "loss": 54.8112, + "step": 3096, + "task_loss": 0.9640390276908875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.49852168615525916, + "compression/movement_sparsity/importance_threshold": -0.0035122211279442176, + "compression/movement_sparsity/linear_layer_sparsity": 0.4270995263529828, + "compression/movement_sparsity/model_sparsity": 0.41242734371093154, + "compression_loss": 53.464141845703125, + "distillation_loss": 1.5863444805145264, + "epoch": 2.62, + "learning_rate": 4.1011552550014096e-05, + "loss": 54.8146, + "step": 3097, + "task_loss": 1.0937414169311523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.49905460674173285, + "compression/movement_sparsity/importance_threshold": -0.0035084886934766547, + "compression/movement_sparsity/linear_layer_sparsity": 0.42782007995488275, + "compression/movement_sparsity/model_sparsity": 0.41312314407992407, + "compression_loss": 53.520896911621094, + "distillation_loss": 1.2905917167663574, + "epoch": 2.62, + "learning_rate": 4.1006856391471776e-05, + "loss": 54.5744, + "step": 3098, + "task_loss": 1.0547055006027222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.4995871496382247, + "compression/movement_sparsity/importance_threshold": -0.003504758904249569, + "compression/movement_sparsity/linear_layer_sparsity": 0.42862054932827826, + "compression/movement_sparsity/model_sparsity": 0.4138961148678116, + "compression_loss": 53.57758712768555, + "distillation_loss": 2.74406099319458, + "epoch": 2.62, + "learning_rate": 4.100216023292946e-05, + "loss": 55.3062, + "step": 3099, + "task_loss": 2.2682535648345947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5001193149786202, + "compression/movement_sparsity/importance_threshold": -0.003501031759325262, + "compression/movement_sparsity/linear_layer_sparsity": 0.4293903020458438, + "compression/movement_sparsity/model_sparsity": 0.41463942421149236, + "compression_loss": 53.63422393798828, + "distillation_loss": 0.7816734313964844, + "epoch": 2.62, + "learning_rate": 4.0997464074387155e-05, + "loss": 55.0809, + "step": 3100, + "task_loss": 0.3357618749141693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5006511028968043, + "compression/movement_sparsity/importance_threshold": -0.003497307257766038, + "compression/movement_sparsity/linear_layer_sparsity": 0.43018934051912294, + "compression/movement_sparsity/model_sparsity": 0.41541101325508456, + "compression_loss": 53.690826416015625, + "distillation_loss": 2.4887375831604004, + "epoch": 2.62, + "learning_rate": 4.099276791584484e-05, + "loss": 55.239, + "step": 3101, + "task_loss": 1.9428654909133911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5011825135266621, + "compression/movement_sparsity/importance_threshold": -0.003493585398634202, + "compression/movement_sparsity/linear_layer_sparsity": 0.4310135747586167, + "compression/movement_sparsity/model_sparsity": 0.4162069325128106, + "compression_loss": 53.74742126464844, + "distillation_loss": 1.369639277458191, + "epoch": 2.62, + "learning_rate": 4.098807175730253e-05, + "loss": 54.8908, + "step": 3102, + "task_loss": 1.6790672540664673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5017135470020788, + "compression/movement_sparsity/importance_threshold": -0.0034898661809920576, + "compression/movement_sparsity/linear_layer_sparsity": 0.43180612648470196, + "compression/movement_sparsity/model_sparsity": 0.41697225764893053, + "compression_loss": 53.80393981933594, + "distillation_loss": 1.4634547233581543, + "epoch": 2.62, + "learning_rate": 4.0983375598760214e-05, + "loss": 55.1121, + "step": 3103, + "task_loss": 0.6079245805740356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5022442034569394, + "compression/movement_sparsity/importance_threshold": -0.0034861496039019094, + "compression/movement_sparsity/linear_layer_sparsity": 0.43256221410615675, + "compression/movement_sparsity/model_sparsity": 0.41770237133459076, + "compression_loss": 53.86044692993164, + "distillation_loss": 0.9265747666358948, + "epoch": 2.62, + "learning_rate": 4.097867944021791e-05, + "loss": 55.0913, + "step": 3104, + "task_loss": 0.9253263473510742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5027744830251293, + "compression/movement_sparsity/importance_threshold": -0.0034824356664260605, + "compression/movement_sparsity/linear_layer_sparsity": 0.43337142389442934, + "compression/movement_sparsity/model_sparsity": 0.4184837822772157, + "compression_loss": 53.916927337646484, + "distillation_loss": 1.2841525077819824, + "epoch": 2.62, + "learning_rate": 4.0973983281675594e-05, + "loss": 55.3996, + "step": 3105, + "task_loss": 1.0232633352279663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5033043858405335, + "compression/movement_sparsity/importance_threshold": -0.0034787243676268147, + "compression/movement_sparsity/linear_layer_sparsity": 0.434119689261915, + "compression/movement_sparsity/model_sparsity": 0.41920634242739474, + "compression_loss": 53.97336196899414, + "distillation_loss": 1.4775420427322388, + "epoch": 2.63, + "learning_rate": 4.096928712313327e-05, + "loss": 55.2664, + "step": 3106, + "task_loss": 1.2546635866165161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5038339120370371, + "compression/movement_sparsity/importance_threshold": -0.003475015706566477, + "compression/movement_sparsity/linear_layer_sparsity": 0.4347484506213541, + "compression/movement_sparsity/model_sparsity": 0.41981350389984085, + "compression_loss": 54.029762268066406, + "distillation_loss": 1.0330852270126343, + "epoch": 2.63, + "learning_rate": 4.0964590964590966e-05, + "loss": 55.2517, + "step": 3107, + "task_loss": 0.8182848691940308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5043630617485253, + "compression/movement_sparsity/importance_threshold": -0.00347130968230735, + "compression/movement_sparsity/linear_layer_sparsity": 0.43553159417917464, + "compression/movement_sparsity/model_sparsity": 0.4205697440672189, + "compression_loss": 54.08612060546875, + "distillation_loss": 1.984728455543518, + "epoch": 2.63, + "learning_rate": 4.095989480604865e-05, + "loss": 55.7509, + "step": 3108, + "task_loss": 1.3393394947052002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5048918351088829, + "compression/movement_sparsity/importance_threshold": -0.0034676062939117415, + "compression/movement_sparsity/linear_layer_sparsity": 0.4363137003344109, + "compression/movement_sparsity/model_sparsity": 0.42132498246998284, + "compression_loss": 54.14242172241211, + "distillation_loss": 1.0384163856506348, + "epoch": 2.63, + "learning_rate": 4.0955198647506346e-05, + "loss": 55.4472, + "step": 3109, + "task_loss": 1.0590802431106567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5054202322519957, + "compression/movement_sparsity/importance_threshold": -0.0034639055404419496, + "compression/movement_sparsity/linear_layer_sparsity": 0.43706835705574937, + "compression/movement_sparsity/model_sparsity": 0.42205371441134776, + "compression_loss": 54.19868469238281, + "distillation_loss": 0.694888710975647, + "epoch": 2.63, + "learning_rate": 4.095050248896403e-05, + "loss": 55.3893, + "step": 3110, + "task_loss": 0.8273056149482727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5059482533117485, + "compression/movement_sparsity/importance_threshold": -0.0034602074209602817, + "compression/movement_sparsity/linear_layer_sparsity": 0.43780465055892864, + "compression/movement_sparsity/model_sparsity": 0.422764713967589, + "compression_loss": 54.254920959472656, + "distillation_loss": 1.1678038835525513, + "epoch": 2.63, + "learning_rate": 4.094580633042172e-05, + "loss": 55.9293, + "step": 3111, + "task_loss": 0.4315330386161804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5064758984220263, + "compression/movement_sparsity/importance_threshold": -0.003456511934529042, + "compression/movement_sparsity/linear_layer_sparsity": 0.4384777578978055, + "compression/movement_sparsity/model_sparsity": 0.42341469799865505, + "compression_loss": 54.3111457824707, + "distillation_loss": 1.1576933860778809, + "epoch": 2.63, + "learning_rate": 4.0941110171879405e-05, + "loss": 55.7686, + "step": 3112, + "task_loss": 1.7973809242248535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5070031677167146, + "compression/movement_sparsity/importance_threshold": -0.003452819080210533, + "compression/movement_sparsity/linear_layer_sparsity": 0.43915708965218825, + "compression/movement_sparsity/model_sparsity": 0.4240706926174059, + "compression_loss": 54.367271423339844, + "distillation_loss": 1.1927461624145508, + "epoch": 2.63, + "learning_rate": 4.093641401333709e-05, + "loss": 55.5985, + "step": 3113, + "task_loss": 1.561108112335205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5075300613296982, + "compression/movement_sparsity/importance_threshold": -0.003449128857067059, + "compression/movement_sparsity/linear_layer_sparsity": 0.4398047031206596, + "compression/movement_sparsity/model_sparsity": 0.42469605857094317, + "compression_loss": 54.42340850830078, + "distillation_loss": 0.9795883297920227, + "epoch": 2.63, + "learning_rate": 4.0931717854794784e-05, + "loss": 55.5081, + "step": 3114, + "task_loss": 0.977310299873352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5080565793948624, + "compression/movement_sparsity/importance_threshold": -0.0034454412641609254, + "compression/movement_sparsity/linear_layer_sparsity": 0.44055916905531595, + "compression/movement_sparsity/model_sparsity": 0.42542460627973533, + "compression_loss": 54.4794921875, + "distillation_loss": 1.2904127836227417, + "epoch": 2.63, + "learning_rate": 4.0927021696252464e-05, + "loss": 55.9005, + "step": 3115, + "task_loss": 1.4650390148162842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5085827220460923, + "compression/movement_sparsity/importance_threshold": -0.003441756300554434, + "compression/movement_sparsity/linear_layer_sparsity": 0.44129987450052044, + "compression/movement_sparsity/model_sparsity": 0.4261398662142206, + "compression_loss": 54.535526275634766, + "distillation_loss": 1.3306338787078857, + "epoch": 2.63, + "learning_rate": 4.092232553771016e-05, + "loss": 55.7659, + "step": 3116, + "task_loss": 1.3768657445907593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.509108489417273, + "compression/movement_sparsity/importance_threshold": -0.003438073965309891, + "compression/movement_sparsity/linear_layer_sparsity": 0.44211108754895584, + "compression/movement_sparsity/model_sparsity": 0.42692321159885904, + "compression_loss": 54.59151077270508, + "distillation_loss": 1.842881679534912, + "epoch": 2.63, + "learning_rate": 4.091762937916784e-05, + "loss": 55.9591, + "step": 3117, + "task_loss": 1.8594143390655518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5096338816422896, + "compression/movement_sparsity/importance_threshold": -0.0034343942574896, + "compression/movement_sparsity/linear_layer_sparsity": 0.4428697865631229, + "compression/movement_sparsity/model_sparsity": 0.4276558469678583, + "compression_loss": 54.64747619628906, + "distillation_loss": 1.9885168075561523, + "epoch": 2.64, + "learning_rate": 4.091293322062553e-05, + "loss": 56.5996, + "step": 3118, + "task_loss": 1.7155975103378296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5101588988550273, + "compression/movement_sparsity/importance_threshold": -0.0034307171761558644, + "compression/movement_sparsity/linear_layer_sparsity": 0.44356760077734125, + "compression/movement_sparsity/model_sparsity": 0.42832968911709063, + "compression_loss": 54.70343780517578, + "distillation_loss": 1.353642463684082, + "epoch": 2.64, + "learning_rate": 4.0908237062083216e-05, + "loss": 55.8876, + "step": 3119, + "task_loss": 0.4191083014011383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5106835411893714, + "compression/movement_sparsity/importance_threshold": -0.0034270427203709868, + "compression/movement_sparsity/linear_layer_sparsity": 0.4444180562614663, + "compression/movement_sparsity/model_sparsity": 0.42915092883902883, + "compression_loss": 54.75932312011719, + "distillation_loss": 1.1758902072906494, + "epoch": 2.64, + "learning_rate": 4.09035409035409e-05, + "loss": 55.8899, + "step": 3120, + "task_loss": 0.31539657711982727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5112078087792069, + "compression/movement_sparsity/importance_threshold": -0.003423370889197273, + "compression/movement_sparsity/linear_layer_sparsity": 0.44519647784890304, + "compression/movement_sparsity/model_sparsity": 0.42990260925023227, + "compression_loss": 54.81515884399414, + "distillation_loss": 1.3323016166687012, + "epoch": 2.64, + "learning_rate": 4.0898844744998595e-05, + "loss": 55.8952, + "step": 3121, + "task_loss": 0.7238379716873169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.511731701758419, + "compression/movement_sparsity/importance_threshold": -0.0034197016816970263, + "compression/movement_sparsity/linear_layer_sparsity": 0.4459388884500795, + "compression/movement_sparsity/model_sparsity": 0.4306195157633362, + "compression_loss": 54.870967864990234, + "distillation_loss": 3.1670589447021484, + "epoch": 2.64, + "learning_rate": 4.089414858645628e-05, + "loss": 56.7952, + "step": 3122, + "task_loss": 0.8310158848762512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5122552202608928, + "compression/movement_sparsity/importance_threshold": -0.0034160350969325505, + "compression/movement_sparsity/linear_layer_sparsity": 0.44663192107307587, + "compression/movement_sparsity/model_sparsity": 0.43128874058371486, + "compression_loss": 54.926700592041016, + "distillation_loss": 1.9490631818771362, + "epoch": 2.64, + "learning_rate": 4.088945242791397e-05, + "loss": 56.5619, + "step": 3123, + "task_loss": 1.2699609994888306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5127783644205133, + "compression/movement_sparsity/importance_threshold": -0.0034123711339661507, + "compression/movement_sparsity/linear_layer_sparsity": 0.4474319731006041, + "compression/movement_sparsity/model_sparsity": 0.4320613083628496, + "compression_loss": 54.98245620727539, + "distillation_loss": 1.7554600238800049, + "epoch": 2.64, + "learning_rate": 4.0884756269371654e-05, + "loss": 56.4067, + "step": 3124, + "task_loss": 0.4941505193710327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5133011343711658, + "compression/movement_sparsity/importance_threshold": -0.0034087097918601298, + "compression/movement_sparsity/linear_layer_sparsity": 0.44818618862774007, + "compression/movement_sparsity/model_sparsity": 0.4327896142663901, + "compression_loss": 55.03812789916992, + "distillation_loss": 1.004862666130066, + "epoch": 2.64, + "learning_rate": 4.088006011082934e-05, + "loss": 56.5542, + "step": 3125, + "task_loss": 0.8703616857528687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5138235302467355, + "compression/movement_sparsity/importance_threshold": -0.003405051069676791, + "compression/movement_sparsity/linear_layer_sparsity": 0.44892269676593677, + "compression/movement_sparsity/model_sparsity": 0.4335008210842757, + "compression_loss": 55.093753814697266, + "distillation_loss": 1.1207001209259033, + "epoch": 2.64, + "learning_rate": 4.0875363952287034e-05, + "loss": 56.6376, + "step": 3126, + "task_loss": 0.8411110639572144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5143455521811073, + "compression/movement_sparsity/importance_threshold": -0.003401394966478441, + "compression/movement_sparsity/linear_layer_sparsity": 0.44950797068600784, + "compression/movement_sparsity/model_sparsity": 0.43406598904467913, + "compression_loss": 55.14937973022461, + "distillation_loss": 1.105970859527588, + "epoch": 2.64, + "learning_rate": 4.087066779374472e-05, + "loss": 56.1333, + "step": 3127, + "task_loss": 0.5834990739822388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5148672003081667, + "compression/movement_sparsity/importance_threshold": -0.003397741481327381, + "compression/movement_sparsity/linear_layer_sparsity": 0.45021637356108685, + "compression/movement_sparsity/model_sparsity": 0.43475005610169704, + "compression_loss": 55.204959869384766, + "distillation_loss": 1.938085675239563, + "epoch": 2.64, + "learning_rate": 4.0865971635202406e-05, + "loss": 56.8655, + "step": 3128, + "task_loss": 1.2625503540039062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5153884747617983, + "compression/movement_sparsity/importance_threshold": -0.003394090613285918, + "compression/movement_sparsity/linear_layer_sparsity": 0.4508935470411275, + "compression/movement_sparsity/model_sparsity": 0.435403966589469, + "compression_loss": 55.260459899902344, + "distillation_loss": 0.907630443572998, + "epoch": 2.64, + "learning_rate": 4.086127547666009e-05, + "loss": 56.6096, + "step": 3129, + "task_loss": 0.9808852076530457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5159093756758878, + "compression/movement_sparsity/importance_threshold": -0.0033904423614163526, + "compression/movement_sparsity/linear_layer_sparsity": 0.4515794251635423, + "compression/movement_sparsity/model_sparsity": 0.436066282688371, + "compression_loss": 55.315921783447266, + "distillation_loss": 1.4840586185455322, + "epoch": 2.65, + "learning_rate": 4.0856579318117786e-05, + "loss": 57.0072, + "step": 3130, + "task_loss": 1.2430903911590576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5164299031843201, + "compression/movement_sparsity/importance_threshold": -0.00338679672478099, + "compression/movement_sparsity/linear_layer_sparsity": 0.4522318321474033, + "compression/movement_sparsity/model_sparsity": 0.4366962774852977, + "compression_loss": 55.371368408203125, + "distillation_loss": 1.6558773517608643, + "epoch": 2.65, + "learning_rate": 4.085188315957547e-05, + "loss": 56.7521, + "step": 3131, + "task_loss": 1.2319852113723755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5169500574209804, + "compression/movement_sparsity/importance_threshold": -0.0033831537024421343, + "compression/movement_sparsity/linear_layer_sparsity": 0.45295492559700967, + "compression/movement_sparsity/model_sparsity": 0.43739453045041443, + "compression_loss": 55.426753997802734, + "distillation_loss": 1.2695668935775757, + "epoch": 2.65, + "learning_rate": 4.084718700103315e-05, + "loss": 56.9505, + "step": 3132, + "task_loss": 1.086560606956482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5174698385197538, + "compression/movement_sparsity/importance_threshold": -0.00337951329346209, + "compression/movement_sparsity/linear_layer_sparsity": 0.4537304973083815, + "compression/movement_sparsity/model_sparsity": 0.43814345888756295, + "compression_loss": 55.48211669921875, + "distillation_loss": 1.5455138683319092, + "epoch": 2.65, + "learning_rate": 4.0842490842490845e-05, + "loss": 57.1748, + "step": 3133, + "task_loss": 1.409749984741211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5179892466145253, + "compression/movement_sparsity/importance_threshold": -0.003375875496903162, + "compression/movement_sparsity/linear_layer_sparsity": 0.45439945503692103, + "compression/movement_sparsity/model_sparsity": 0.4387894358601725, + "compression_loss": 55.537445068359375, + "distillation_loss": 1.3241931200027466, + "epoch": 2.65, + "learning_rate": 4.083779468394853e-05, + "loss": 57.2464, + "step": 3134, + "task_loss": 1.2736042737960815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5185082818391804, + "compression/movement_sparsity/importance_threshold": -0.0033722403118276504, + "compression/movement_sparsity/linear_layer_sparsity": 0.45503917470641747, + "compression/movement_sparsity/model_sparsity": 0.4394071791910138, + "compression_loss": 55.59274673461914, + "distillation_loss": 1.102333426475525, + "epoch": 2.65, + "learning_rate": 4.0833098525406224e-05, + "loss": 57.1977, + "step": 3135, + "task_loss": 1.0903507471084595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5190269443276039, + "compression/movement_sparsity/importance_threshold": -0.003368607737297863, + "compression/movement_sparsity/linear_layer_sparsity": 0.45562721503338005, + "compression/movement_sparsity/model_sparsity": 0.4399750185237216, + "compression_loss": 55.64801025390625, + "distillation_loss": 1.064621925354004, + "epoch": 2.65, + "learning_rate": 4.0828402366863904e-05, + "loss": 56.9824, + "step": 3136, + "task_loss": 1.6531810760498047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.519545234213681, + "compression/movement_sparsity/importance_threshold": -0.0033649777723761026, + "compression/movement_sparsity/linear_layer_sparsity": 0.4562850952101859, + "compression/movement_sparsity/model_sparsity": 0.440610298492578, + "compression_loss": 55.7032470703125, + "distillation_loss": 1.2678351402282715, + "epoch": 2.65, + "learning_rate": 4.08237062083216e-05, + "loss": 57.1982, + "step": 3137, + "task_loss": 1.4507288932800293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5200631516312969, + "compression/movement_sparsity/importance_threshold": -0.0033613504161246737, + "compression/movement_sparsity/linear_layer_sparsity": 0.4568534725847169, + "compression/movement_sparsity/model_sparsity": 0.4411591503557606, + "compression_loss": 55.75844192504883, + "distillation_loss": 1.601014256477356, + "epoch": 2.65, + "learning_rate": 4.081901004977928e-05, + "loss": 57.0944, + "step": 3138, + "task_loss": 1.467469334602356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5205806967143366, + "compression/movement_sparsity/importance_threshold": -0.0033577256676058805, + "compression/movement_sparsity/linear_layer_sparsity": 0.45754954587046043, + "compression/movement_sparsity/model_sparsity": 0.4418313113827669, + "compression_loss": 55.81358337402344, + "distillation_loss": 1.1885582208633423, + "epoch": 2.65, + "learning_rate": 4.081431389123697e-05, + "loss": 57.3256, + "step": 3139, + "task_loss": 1.1096551418304443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5210978695966855, + "compression/movement_sparsity/importance_threshold": -0.003354103525882025, + "compression/movement_sparsity/linear_layer_sparsity": 0.4582610967257953, + "compression/movement_sparsity/model_sparsity": 0.44251841827723454, + "compression_loss": 55.86868667602539, + "distillation_loss": 1.1433416604995728, + "epoch": 2.65, + "learning_rate": 4.080961773269466e-05, + "loss": 57.1295, + "step": 3140, + "task_loss": 1.646550178527832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5216146704122288, + "compression/movement_sparsity/importance_threshold": -0.0033504839900154117, + "compression/movement_sparsity/linear_layer_sparsity": 0.45896054455097973, + "compression/movement_sparsity/model_sparsity": 0.4431938379178707, + "compression_loss": 55.92373275756836, + "distillation_loss": 1.1853364706039429, + "epoch": 2.65, + "learning_rate": 4.080492157415234e-05, + "loss": 57.1464, + "step": 3141, + "task_loss": 0.7650507092475891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5221310992948514, + "compression/movement_sparsity/importance_threshold": -0.0033468670590683456, + "compression/movement_sparsity/linear_layer_sparsity": 0.4596704021745103, + "compression/movement_sparsity/model_sparsity": 0.4438793097482555, + "compression_loss": 55.97872543334961, + "distillation_loss": 0.8294848203659058, + "epoch": 2.66, + "learning_rate": 4.0800225415610036e-05, + "loss": 57.1471, + "step": 3142, + "task_loss": 0.3979688882827759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5226471563784383, + "compression/movement_sparsity/importance_threshold": -0.003343252732103131, + "compression/movement_sparsity/linear_layer_sparsity": 0.4604246653983168, + "compression/movement_sparsity/model_sparsity": 0.4446076617099392, + "compression_loss": 56.03367614746094, + "distillation_loss": 1.0178886651992798, + "epoch": 2.66, + "learning_rate": 4.079552925706772e-05, + "loss": 57.3249, + "step": 3143, + "task_loss": 1.4002323150634766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.523162841796875, + "compression/movement_sparsity/importance_threshold": -0.0033396410081820704, + "compression/movement_sparsity/linear_layer_sparsity": 0.4611335333159336, + "compression/movement_sparsity/model_sparsity": 0.44529217783385305, + "compression_loss": 56.08857345581055, + "distillation_loss": 1.1955482959747314, + "epoch": 2.66, + "learning_rate": 4.079083309852541e-05, + "loss": 57.4477, + "step": 3144, + "task_loss": 0.39195290207862854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5236781556840464, + "compression/movement_sparsity/importance_threshold": -0.003336031886367469, + "compression/movement_sparsity/linear_layer_sparsity": 0.4618110883693386, + "compression/movement_sparsity/model_sparsity": 0.4459464567867705, + "compression_loss": 56.14344787597656, + "distillation_loss": 1.0837123394012451, + "epoch": 2.66, + "learning_rate": 4.0786136939983095e-05, + "loss": 57.8087, + "step": 3145, + "task_loss": 1.658868432044983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5241930981738379, + "compression/movement_sparsity/importance_threshold": -0.00333242536572163, + "compression/movement_sparsity/linear_layer_sparsity": 0.4625975349215943, + "compression/movement_sparsity/model_sparsity": 0.4467058864805636, + "compression_loss": 56.19829559326172, + "distillation_loss": 1.3995057344436646, + "epoch": 2.66, + "learning_rate": 4.078144078144078e-05, + "loss": 57.9952, + "step": 3146, + "task_loss": 0.6903823018074036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5247076694001342, + "compression/movement_sparsity/importance_threshold": -0.0033288214453068583, + "compression/movement_sparsity/linear_layer_sparsity": 0.46321326316580735, + "compression/movement_sparsity/model_sparsity": 0.4473004625653863, + "compression_loss": 56.25309371948242, + "distillation_loss": 1.887570858001709, + "epoch": 2.66, + "learning_rate": 4.0776744622898474e-05, + "loss": 57.4979, + "step": 3147, + "task_loss": 1.6376639604568481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5252218694968209, + "compression/movement_sparsity/importance_threshold": -0.003325220124185457, + "compression/movement_sparsity/linear_layer_sparsity": 0.46376330117051445, + "compression/movement_sparsity/model_sparsity": 0.4478316050725169, + "compression_loss": 56.307865142822266, + "distillation_loss": 1.1850448846817017, + "epoch": 2.66, + "learning_rate": 4.077204846435616e-05, + "loss": 58.0396, + "step": 3148, + "task_loss": 1.1419734954833984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.525735698597783, + "compression/movement_sparsity/importance_threshold": -0.0033216214014197293, + "compression/movement_sparsity/linear_layer_sparsity": 0.46443825675537487, + "compression/movement_sparsity/model_sparsity": 0.44848337385663106, + "compression_loss": 56.36259078979492, + "distillation_loss": 1.2688450813293457, + "epoch": 2.66, + "learning_rate": 4.076735230581385e-05, + "loss": 57.5172, + "step": 3149, + "task_loss": 0.7160232663154602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5262491568369052, + "compression/movement_sparsity/importance_threshold": -0.0033180252760719835, + "compression/movement_sparsity/linear_layer_sparsity": 0.46511447630200464, + "compression/movement_sparsity/model_sparsity": 0.4491363631815395, + "compression_loss": 56.41731262207031, + "distillation_loss": 1.0588514804840088, + "epoch": 2.66, + "learning_rate": 4.076265614727153e-05, + "loss": 57.7745, + "step": 3150, + "task_loss": 0.9721276760101318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5267622443480733, + "compression/movement_sparsity/importance_threshold": -0.0033144317472045184, + "compression/movement_sparsity/linear_layer_sparsity": 0.46584626246981753, + "compression/movement_sparsity/model_sparsity": 0.44984301024325046, + "compression_loss": 56.47195053100586, + "distillation_loss": 1.5750703811645508, + "epoch": 2.66, + "learning_rate": 4.075795998872922e-05, + "loss": 57.9782, + "step": 3151, + "task_loss": 1.784942865371704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5272749612651723, + "compression/movement_sparsity/importance_threshold": -0.003310840813879639, + "compression/movement_sparsity/linear_layer_sparsity": 0.46647248398155017, + "compression/movement_sparsity/model_sparsity": 0.4504477191195723, + "compression_loss": 56.526580810546875, + "distillation_loss": 2.9790143966674805, + "epoch": 2.66, + "learning_rate": 4.075326383018691e-05, + "loss": 58.8701, + "step": 3152, + "task_loss": 1.8907215595245361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.527787307722087, + "compression/movement_sparsity/importance_threshold": -0.0033072524751596504, + "compression/movement_sparsity/linear_layer_sparsity": 0.46713847259234836, + "compression/movement_sparsity/model_sparsity": 0.45109082897276903, + "compression_loss": 56.58113479614258, + "distillation_loss": 1.3961204290390015, + "epoch": 2.66, + "learning_rate": 4.074856767164459e-05, + "loss": 57.7439, + "step": 3153, + "task_loss": 0.5844182968139648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5282992838527029, + "compression/movement_sparsity/importance_threshold": -0.003303666730106857, + "compression/movement_sparsity/linear_layer_sparsity": 0.4679034198703566, + "compression/movement_sparsity/model_sparsity": 0.4518294979585246, + "compression_loss": 56.63566207885742, + "distillation_loss": 1.3831321001052856, + "epoch": 2.67, + "learning_rate": 4.0743871513102285e-05, + "loss": 58.2501, + "step": 3154, + "task_loss": 1.2835367918014526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5288108897909048, + "compression/movement_sparsity/importance_threshold": -0.0033000835777835624, + "compression/movement_sparsity/linear_layer_sparsity": 0.4685845521740524, + "compression/movement_sparsity/model_sparsity": 0.4524872312721804, + "compression_loss": 56.690185546875, + "distillation_loss": 0.9613630771636963, + "epoch": 2.67, + "learning_rate": 4.073917535455997e-05, + "loss": 57.8719, + "step": 3155, + "task_loss": 1.0267081260681152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5293221256705782, + "compression/movement_sparsity/importance_threshold": -0.0032965030172520697, + "compression/movement_sparsity/linear_layer_sparsity": 0.4692471066245715, + "compression/movement_sparsity/model_sparsity": 0.45312702493906826, + "compression_loss": 56.74460983276367, + "distillation_loss": 1.402208924293518, + "epoch": 2.67, + "learning_rate": 4.073447919601766e-05, + "loss": 58.3487, + "step": 3156, + "task_loss": 1.5169488191604614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.529832991625608, + "compression/movement_sparsity/importance_threshold": -0.0032929250475746837, + "compression/movement_sparsity/linear_layer_sparsity": 0.46993381943872087, + "compression/movement_sparsity/model_sparsity": 0.4537901470554758, + "compression_loss": 56.79901123046875, + "distillation_loss": 1.909407615661621, + "epoch": 2.67, + "learning_rate": 4.072978303747535e-05, + "loss": 58.2315, + "step": 3157, + "task_loss": 0.8726487159729004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5303434877898795, + "compression/movement_sparsity/importance_threshold": -0.003289349667813708, + "compression/movement_sparsity/linear_layer_sparsity": 0.47075058914927453, + "compression/movement_sparsity/model_sparsity": 0.4545788582137945, + "compression_loss": 56.853363037109375, + "distillation_loss": 1.6018075942993164, + "epoch": 2.67, + "learning_rate": 4.072508687893303e-05, + "loss": 58.5589, + "step": 3158, + "task_loss": 1.5681837797164917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5308536142972777, + "compression/movement_sparsity/importance_threshold": -0.0032857768770314464, + "compression/movement_sparsity/linear_layer_sparsity": 0.4716123964409889, + "compression/movement_sparsity/model_sparsity": 0.45541105977380913, + "compression_loss": 56.9077033996582, + "distillation_loss": 1.9071273803710938, + "epoch": 2.67, + "learning_rate": 4.0720390720390724e-05, + "loss": 58.6694, + "step": 3159, + "task_loss": 1.5857409238815308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5313633712816875, + "compression/movement_sparsity/importance_threshold": -0.003282206674290206, + "compression/movement_sparsity/linear_layer_sparsity": 0.47234378911126984, + "compression/movement_sparsity/model_sparsity": 0.4561173268558389, + "compression_loss": 56.96199035644531, + "distillation_loss": 1.752554178237915, + "epoch": 2.67, + "learning_rate": 4.071569456184841e-05, + "loss": 58.87, + "step": 3160, + "task_loss": 1.9640041589736938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5318727588769947, + "compression/movement_sparsity/importance_threshold": -0.003278639058652285, + "compression/movement_sparsity/linear_layer_sparsity": 0.4729347508593032, + "compression/movement_sparsity/model_sparsity": 0.45668798724981635, + "compression_loss": 57.01624298095703, + "distillation_loss": 1.748987078666687, + "epoch": 2.67, + "learning_rate": 4.07109984033061e-05, + "loss": 58.5835, + "step": 3161, + "task_loss": 1.0655022859573364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5323817772170839, + "compression/movement_sparsity/importance_threshold": -0.003275074029179992, + "compression/movement_sparsity/linear_layer_sparsity": 0.4736234073127772, + "compression/movement_sparsity/model_sparsity": 0.4573529862355585, + "compression_loss": 57.07045364379883, + "distillation_loss": 1.2259505987167358, + "epoch": 2.67, + "learning_rate": 4.070630224476378e-05, + "loss": 58.4556, + "step": 3162, + "task_loss": 0.23454053699970245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5328904264358405, + "compression/movement_sparsity/importance_threshold": -0.003271511584935629, + "compression/movement_sparsity/linear_layer_sparsity": 0.47419929691291884, + "compression/movement_sparsity/model_sparsity": 0.4579090922562916, + "compression_loss": 57.1246337890625, + "distillation_loss": 1.1683413982391357, + "epoch": 2.67, + "learning_rate": 4.070160608622147e-05, + "loss": 58.5648, + "step": 3163, + "task_loss": 2.109626293182373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5333987066671495, + "compression/movement_sparsity/importance_threshold": -0.0032679517249815, + "compression/movement_sparsity/linear_layer_sparsity": 0.47478903047168575, + "compression/movement_sparsity/model_sparsity": 0.4584785666530822, + "compression_loss": 57.178768157958984, + "distillation_loss": 1.8769023418426514, + "epoch": 2.67, + "learning_rate": 4.069690992767916e-05, + "loss": 58.7883, + "step": 3164, + "task_loss": 2.054685592651367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5339066180448961, + "compression/movement_sparsity/importance_threshold": -0.0032643944483799095, + "compression/movement_sparsity/linear_layer_sparsity": 0.47536050812980213, + "compression/movement_sparsity/model_sparsity": 0.45903041229557134, + "compression_loss": 57.23289489746094, + "distillation_loss": 1.4572021961212158, + "epoch": 2.67, + "learning_rate": 4.069221376913685e-05, + "loss": 58.5199, + "step": 3165, + "task_loss": 1.5350674390792847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5344141607029655, + "compression/movement_sparsity/importance_threshold": -0.0032608397541931612, + "compression/movement_sparsity/linear_layer_sparsity": 0.4759527696121078, + "compression/movement_sparsity/model_sparsity": 0.45960232777395044, + "compression_loss": 57.28696823120117, + "distillation_loss": 1.1777393817901611, + "epoch": 2.68, + "learning_rate": 4.0687517610594535e-05, + "loss": 58.5678, + "step": 3166, + "task_loss": 1.2994797229766846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5349213347752428, + "compression/movement_sparsity/importance_threshold": -0.0032572876414835587, + "compression/movement_sparsity/linear_layer_sparsity": 0.4767330036730252, + "compression/movement_sparsity/model_sparsity": 0.4603557583945946, + "compression_loss": 57.340965270996094, + "distillation_loss": 2.705673933029175, + "epoch": 2.68, + "learning_rate": 4.068282145205222e-05, + "loss": 58.8753, + "step": 3167, + "task_loss": 2.175607919692993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.535428140395613, + "compression/movement_sparsity/importance_threshold": -0.0032537381093134077, + "compression/movement_sparsity/linear_layer_sparsity": 0.477335674953677, + "compression/movement_sparsity/model_sparsity": 0.46093772606272226, + "compression_loss": 57.39493942260742, + "distillation_loss": 1.0088363885879517, + "epoch": 2.68, + "learning_rate": 4.0678125293509914e-05, + "loss": 58.8704, + "step": 3168, + "task_loss": 0.5511782765388489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5359345776979615, + "compression/movement_sparsity/importance_threshold": -0.00325019115674501, + "compression/movement_sparsity/linear_layer_sparsity": 0.4779708395911365, + "compression/movement_sparsity/model_sparsity": 0.4615510708408901, + "compression_loss": 57.44886016845703, + "distillation_loss": 1.4820741415023804, + "epoch": 2.68, + "learning_rate": 4.06734291349676e-05, + "loss": 58.7629, + "step": 3169, + "task_loss": 1.792464017868042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.536440646816173, + "compression/movement_sparsity/importance_threshold": -0.0032466467828406722, + "compression/movement_sparsity/linear_layer_sparsity": 0.47855704359628315, + "compression/movement_sparsity/model_sparsity": 0.4621171369350855, + "compression_loss": 57.502716064453125, + "distillation_loss": 1.3986766338348389, + "epoch": 2.68, + "learning_rate": 4.066873297642528e-05, + "loss": 58.7571, + "step": 3170, + "task_loss": 0.8458770513534546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5369463478841332, + "compression/movement_sparsity/importance_threshold": -0.0032431049866626952, + "compression/movement_sparsity/linear_layer_sparsity": 0.4791641148947926, + "compression/movement_sparsity/model_sparsity": 0.4627033534669214, + "compression_loss": 57.556549072265625, + "distillation_loss": 1.7659282684326172, + "epoch": 2.68, + "learning_rate": 4.066403681788297e-05, + "loss": 59.0347, + "step": 3171, + "task_loss": 2.0756285190582275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5374516810357268, + "compression/movement_sparsity/importance_threshold": -0.0032395657672733853, + "compression/movement_sparsity/linear_layer_sparsity": 0.4798046692560235, + "compression/movement_sparsity/model_sparsity": 0.4633219028152683, + "compression_loss": 57.61033630371094, + "distillation_loss": 0.8603739142417908, + "epoch": 2.68, + "learning_rate": 4.065934065934066e-05, + "loss": 59.0686, + "step": 3172, + "task_loss": 0.22714926302433014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5379566464048393, + "compression/movement_sparsity/importance_threshold": -0.0032360291237350444, + "compression/movement_sparsity/linear_layer_sparsity": 0.4804801852767628, + "compression/movement_sparsity/model_sparsity": 0.46397421278256484, + "compression_loss": 57.6640739440918, + "distillation_loss": 1.4142537117004395, + "epoch": 2.68, + "learning_rate": 4.065464450079835e-05, + "loss": 59.238, + "step": 3173, + "task_loss": 0.47118258476257324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5384612441253557, + "compression/movement_sparsity/importance_threshold": -0.0032324950551099775, + "compression/movement_sparsity/linear_layer_sparsity": 0.4810939698816512, + "compression/movement_sparsity/model_sparsity": 0.46456691199805306, + "compression_loss": 57.717811584472656, + "distillation_loss": 1.3783066272735596, + "epoch": 2.68, + "learning_rate": 4.064994834225604e-05, + "loss": 59.0028, + "step": 3174, + "task_loss": 1.300521969795227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5389654743311609, + "compression/movement_sparsity/importance_threshold": -0.0032289635604604897, + "compression/movement_sparsity/linear_layer_sparsity": 0.48175378177361405, + "compression/movement_sparsity/model_sparsity": 0.46520405732170816, + "compression_loss": 57.771461486816406, + "distillation_loss": 2.1710128784179688, + "epoch": 2.68, + "learning_rate": 4.0645252183713725e-05, + "loss": 59.7202, + "step": 3175, + "task_loss": 1.5312451124191284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5394693371561404, + "compression/movement_sparsity/importance_threshold": -0.003225434638848883, + "compression/movement_sparsity/linear_layer_sparsity": 0.4824322788362623, + "compression/movement_sparsity/model_sparsity": 0.4658592459229533, + "compression_loss": 57.825103759765625, + "distillation_loss": 1.1456093788146973, + "epoch": 2.68, + "learning_rate": 4.064055602517141e-05, + "loss": 59.2378, + "step": 3176, + "task_loss": 0.935524582862854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5399728327341792, + "compression/movement_sparsity/importance_threshold": -0.003221908289337463, + "compression/movement_sparsity/linear_layer_sparsity": 0.4833285469724443, + "compression/movement_sparsity/model_sparsity": 0.46672472449141417, + "compression_loss": 57.878692626953125, + "distillation_loss": 2.597580909729004, + "epoch": 2.69, + "learning_rate": 4.06358598666291e-05, + "loss": 59.8918, + "step": 3177, + "task_loss": 1.5374902486801147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5404759611991623, + "compression/movement_sparsity/importance_threshold": -0.0032183845109885335, + "compression/movement_sparsity/linear_layer_sparsity": 0.4841042140771572, + "compression/movement_sparsity/model_sparsity": 0.46747374504484906, + "compression_loss": 57.9322509765625, + "distillation_loss": 1.9911036491394043, + "epoch": 2.69, + "learning_rate": 4.063116370808679e-05, + "loss": 59.5648, + "step": 3178, + "task_loss": 1.6181129217147827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.540978722684975, + "compression/movement_sparsity/importance_threshold": -0.0032148633028643977, + "compression/movement_sparsity/linear_layer_sparsity": 0.48483422354399236, + "compression/movement_sparsity/model_sparsity": 0.46817867644072664, + "compression_loss": 57.985809326171875, + "distillation_loss": 1.5482800006866455, + "epoch": 2.69, + "learning_rate": 4.062646754954447e-05, + "loss": 59.3552, + "step": 3179, + "task_loss": 1.6430931091308594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5414811173255023, + "compression/movement_sparsity/importance_threshold": -0.003211344664027361, + "compression/movement_sparsity/linear_layer_sparsity": 0.4856544393389928, + "compression/movement_sparsity/model_sparsity": 0.46897071529988993, + "compression_loss": 58.03929138183594, + "distillation_loss": 1.9776475429534912, + "epoch": 2.69, + "learning_rate": 4.0621771391002164e-05, + "loss": 59.6949, + "step": 3180, + "task_loss": 1.523226261138916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5419831452546295, + "compression/movement_sparsity/importance_threshold": -0.0032078285935397254, + "compression/movement_sparsity/linear_layer_sparsity": 0.4862957806952877, + "compression/movement_sparsity/model_sparsity": 0.4695900246075993, + "compression_loss": 58.0927619934082, + "distillation_loss": 0.862718403339386, + "epoch": 2.69, + "learning_rate": 4.061707523245985e-05, + "loss": 59.7823, + "step": 3181, + "task_loss": 0.17751680314540863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5424848066062418, + "compression/movement_sparsity/importance_threshold": -0.0032043150904637954, + "compression/movement_sparsity/linear_layer_sparsity": 0.4868397612228358, + "compression/movement_sparsity/model_sparsity": 0.47011531773054627, + "compression_loss": 58.14616394042969, + "distillation_loss": 1.832399845123291, + "epoch": 2.69, + "learning_rate": 4.0612379073917537e-05, + "loss": 59.8514, + "step": 3182, + "task_loss": 1.7924641370773315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5429861015142242, + "compression/movement_sparsity/importance_threshold": -0.0032008041538618753, + "compression/movement_sparsity/linear_layer_sparsity": 0.4874428975460254, + "compression/movement_sparsity/model_sparsity": 0.4706977344655699, + "compression_loss": 58.19955062866211, + "distillation_loss": 1.6733181476593018, + "epoch": 2.69, + "learning_rate": 4.060768291537522e-05, + "loss": 59.327, + "step": 3183, + "task_loss": 1.151419997215271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5434870301124619, + "compression/movement_sparsity/importance_threshold": -0.003197295782796269, + "compression/movement_sparsity/linear_layer_sparsity": 0.48809196576294833, + "compression/movement_sparsity/model_sparsity": 0.47132450519247415, + "compression_loss": 58.252864837646484, + "distillation_loss": 1.310351848602295, + "epoch": 2.69, + "learning_rate": 4.060298675683291e-05, + "loss": 59.5851, + "step": 3184, + "task_loss": 1.4411141872406006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5439875925348399, + "compression/movement_sparsity/importance_threshold": -0.0031937899763292816, + "compression/movement_sparsity/linear_layer_sparsity": 0.4888370354535076, + "compression/movement_sparsity/model_sparsity": 0.4720439794470602, + "compression_loss": 58.30614471435547, + "distillation_loss": 1.3653380870819092, + "epoch": 2.69, + "learning_rate": 4.05982905982906e-05, + "loss": 59.873, + "step": 3185, + "task_loss": 1.3209458589553833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5444877889152435, + "compression/movement_sparsity/importance_threshold": -0.003190286733523215, + "compression/movement_sparsity/linear_layer_sparsity": 0.48947371446025684, + "compression/movement_sparsity/model_sparsity": 0.47265878657127397, + "compression_loss": 58.35938262939453, + "distillation_loss": 1.6706267595291138, + "epoch": 2.69, + "learning_rate": 4.059359443974829e-05, + "loss": 59.7526, + "step": 3186, + "task_loss": 1.3089333772659302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5449876193875578, + "compression/movement_sparsity/importance_threshold": -0.003186786053440375, + "compression/movement_sparsity/linear_layer_sparsity": 0.49008451802323627, + "compression/movement_sparsity/model_sparsity": 0.4732486071528135, + "compression_loss": 58.412567138671875, + "distillation_loss": 2.037510633468628, + "epoch": 2.69, + "learning_rate": 4.0588898281205975e-05, + "loss": 60.0894, + "step": 3187, + "task_loss": 0.9613476395606995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5454870840856679, + "compression/movement_sparsity/importance_threshold": -0.0031832879351430645, + "compression/movement_sparsity/linear_layer_sparsity": 0.49073495751943735, + "compression/movement_sparsity/model_sparsity": 0.47387670205133414, + "compression_loss": 58.46571731567383, + "distillation_loss": 1.2410633563995361, + "epoch": 2.69, + "learning_rate": 4.058420212266366e-05, + "loss": 60.0393, + "step": 3188, + "task_loss": 1.5163161754608154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.545986183143459, + "compression/movement_sparsity/importance_threshold": -0.0031797923776935876, + "compression/movement_sparsity/linear_layer_sparsity": 0.49148907765323224, + "compression/movement_sparsity/model_sparsity": 0.47460491583858827, + "compression_loss": 58.518821716308594, + "distillation_loss": 1.7192423343658447, + "epoch": 2.7, + "learning_rate": 4.057950596412135e-05, + "loss": 60.7557, + "step": 3189, + "task_loss": 1.6420795917510986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.546484916694816, + "compression/movement_sparsity/importance_threshold": -0.0031762993801542506, + "compression/movement_sparsity/linear_layer_sparsity": 0.49216837363511207, + "compression/movement_sparsity/model_sparsity": 0.4752608759137317, + "compression_loss": 58.57189178466797, + "distillation_loss": 1.687978982925415, + "epoch": 2.7, + "learning_rate": 4.057480980557904e-05, + "loss": 60.0822, + "step": 3190, + "task_loss": 1.119887113571167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5469832848736245, + "compression/movement_sparsity/importance_threshold": -0.0031728089415873537, + "compression/movement_sparsity/linear_layer_sparsity": 0.49281641637361834, + "compression/movement_sparsity/model_sparsity": 0.4758866563905576, + "compression_loss": 58.624908447265625, + "distillation_loss": 1.252699375152588, + "epoch": 2.7, + "learning_rate": 4.057011364703673e-05, + "loss": 60.2995, + "step": 3191, + "task_loss": 0.2589733302593231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5474812878137694, + "compression/movement_sparsity/importance_threshold": -0.0031693210610552022, + "compression/movement_sparsity/linear_layer_sparsity": 0.4934945557112375, + "compression/movement_sparsity/model_sparsity": 0.4765414995557289, + "compression_loss": 58.67790985107422, + "distillation_loss": 1.2184535264968872, + "epoch": 2.7, + "learning_rate": 4.0565417488494413e-05, + "loss": 60.3341, + "step": 3192, + "task_loss": 0.4556049406528473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5479789256491356, + "compression/movement_sparsity/importance_threshold": -0.0031658357376201014, + "compression/movement_sparsity/linear_layer_sparsity": 0.49418818454261565, + "compression/movement_sparsity/model_sparsity": 0.47721130010289736, + "compression_loss": 58.73088073730469, + "distillation_loss": 0.9668209552764893, + "epoch": 2.7, + "learning_rate": 4.05607213299521e-05, + "loss": 60.3313, + "step": 3193, + "task_loss": 1.2551921606063843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5484761985136087, + "compression/movement_sparsity/importance_threshold": -0.0031623529703443537, + "compression/movement_sparsity/linear_layer_sparsity": 0.49479116585162597, + "compression/movement_sparsity/model_sparsity": 0.4777935671489557, + "compression_loss": 58.78380584716797, + "distillation_loss": 0.7016304135322571, + "epoch": 2.7, + "learning_rate": 4.0556025171409786e-05, + "loss": 60.1835, + "step": 3194, + "task_loss": 0.8164786696434021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5489731065410736, + "compression/movement_sparsity/importance_threshold": -0.003158872758290264, + "compression/movement_sparsity/linear_layer_sparsity": 0.4954844011854721, + "compression/movement_sparsity/model_sparsity": 0.4784629877164429, + "compression_loss": 58.83666229248047, + "distillation_loss": 1.6767808198928833, + "epoch": 2.7, + "learning_rate": 4.055132901286748e-05, + "loss": 60.4673, + "step": 3195, + "task_loss": 1.5425060987472534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5494696498654155, + "compression/movement_sparsity/importance_threshold": -0.003155395100520135, + "compression/movement_sparsity/linear_layer_sparsity": 0.4963608513550433, + "compression/movement_sparsity/model_sparsity": 0.4793093291264132, + "compression_loss": 58.8895263671875, + "distillation_loss": 1.538779854774475, + "epoch": 2.7, + "learning_rate": 4.054663285432516e-05, + "loss": 60.2842, + "step": 3196, + "task_loss": 0.3826006352901459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5499658286205193, + "compression/movement_sparsity/importance_threshold": -0.0031519199960962727, + "compression/movement_sparsity/linear_layer_sparsity": 0.49715571636964995, + "compression/movement_sparsity/model_sparsity": 0.48007688808247734, + "compression_loss": 58.94231414794922, + "distillation_loss": 1.7234290838241577, + "epoch": 2.7, + "learning_rate": 4.054193669578285e-05, + "loss": 60.6546, + "step": 3197, + "task_loss": 0.7311191558837891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5504616429402704, + "compression/movement_sparsity/importance_threshold": -0.00314844744408098, + "compression/movement_sparsity/linear_layer_sparsity": 0.4976947006709586, + "compression/movement_sparsity/model_sparsity": 0.48059735661492636, + "compression_loss": 58.995086669921875, + "distillation_loss": 2.7852134704589844, + "epoch": 2.7, + "learning_rate": 4.053724053724054e-05, + "loss": 60.6698, + "step": 3198, + "task_loss": 1.1367361545562744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5509570929585539, + "compression/movement_sparsity/importance_threshold": -0.003144977443536561, + "compression/movement_sparsity/linear_layer_sparsity": 0.4984495124064764, + "compression/movement_sparsity/model_sparsity": 0.4813262382452566, + "compression_loss": 59.047813415527344, + "distillation_loss": 2.6569573879241943, + "epoch": 2.7, + "learning_rate": 4.053254437869823e-05, + "loss": 61.0445, + "step": 3199, + "task_loss": 1.802232265472412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5514521788092548, + "compression/movement_sparsity/importance_threshold": -0.0031415099935253204, + "compression/movement_sparsity/linear_layer_sparsity": 0.4989969390184712, + "compression/movement_sparsity/model_sparsity": 0.4818548590690482, + "compression_loss": 59.10051345825195, + "distillation_loss": 1.2958006858825684, + "epoch": 2.7, + "learning_rate": 4.052784822015591e-05, + "loss": 60.5185, + "step": 3200, + "task_loss": 1.6842865943908691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5519469006262585, + "compression/movement_sparsity/importance_threshold": -0.00313804509310956, + "compression/movement_sparsity/linear_layer_sparsity": 0.49967661657371537, + "compression/movement_sparsity/model_sparsity": 0.482511187609337, + "compression_loss": 59.153106689453125, + "distillation_loss": 0.7837212085723877, + "epoch": 2.71, + "learning_rate": 4.05231520616136e-05, + "loss": 60.4069, + "step": 3201, + "task_loss": 1.0939966440200806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5524412585434499, + "compression/movement_sparsity/importance_threshold": -0.003134582741351586, + "compression/movement_sparsity/linear_layer_sparsity": 0.5004195279899326, + "compression/movement_sparsity/model_sparsity": 0.48322857773294425, + "compression_loss": 59.20572280883789, + "distillation_loss": 1.9697004556655884, + "epoch": 2.71, + "learning_rate": 4.051845590307129e-05, + "loss": 61.1588, + "step": 3202, + "task_loss": 2.5490407943725586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5529352526947144, + "compression/movement_sparsity/importance_threshold": -0.0031311229373137003, + "compression/movement_sparsity/linear_layer_sparsity": 0.5010503880028755, + "compression/movement_sparsity/model_sparsity": 0.48383776576369025, + "compression_loss": 59.25829315185547, + "distillation_loss": 0.9899154305458069, + "epoch": 2.71, + "learning_rate": 4.051375974452898e-05, + "loss": 60.8375, + "step": 3203, + "task_loss": 0.7529734373092651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5534288832139369, + "compression/movement_sparsity/importance_threshold": -0.0031276656800582094, + "compression/movement_sparsity/linear_layer_sparsity": 0.5018538980390181, + "compression/movement_sparsity/model_sparsity": 0.4846136727582054, + "compression_loss": 59.31081008911133, + "distillation_loss": 1.1852328777313232, + "epoch": 2.71, + "learning_rate": 4.050906358598667e-05, + "loss": 60.9939, + "step": 3204, + "task_loss": 0.7671400308609009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5539221502350027, + "compression/movement_sparsity/importance_threshold": -0.003124210968647415, + "compression/movement_sparsity/linear_layer_sparsity": 0.5024524435576679, + "compression/movement_sparsity/model_sparsity": 0.48519165639694817, + "compression_loss": 59.36328887939453, + "distillation_loss": 1.711308479309082, + "epoch": 2.71, + "learning_rate": 4.050436742744435e-05, + "loss": 60.7577, + "step": 3205, + "task_loss": 1.168859839439392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5544150538917967, + "compression/movement_sparsity/importance_threshold": -0.003120758802143623, + "compression/movement_sparsity/linear_layer_sparsity": 0.5031805213093461, + "compression/movement_sparsity/model_sparsity": 0.485894722438027, + "compression_loss": 59.415714263916016, + "distillation_loss": 1.759476900100708, + "epoch": 2.71, + "learning_rate": 4.049967126890204e-05, + "loss": 61.2151, + "step": 3206, + "task_loss": 1.6526076793670654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5549075943182042, + "compression/movement_sparsity/importance_threshold": -0.003117309179609136, + "compression/movement_sparsity/linear_layer_sparsity": 0.5038841545173708, + "compression/movement_sparsity/model_sparsity": 0.4865741836807271, + "compression_loss": 59.46808624267578, + "distillation_loss": 2.3347909450531006, + "epoch": 2.71, + "learning_rate": 4.049497511035973e-05, + "loss": 61.1834, + "step": 3207, + "task_loss": 1.211999535560608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5553997716481105, + "compression/movement_sparsity/importance_threshold": -0.0031138621001062586, + "compression/movement_sparsity/linear_layer_sparsity": 0.504497092506357, + "compression/movement_sparsity/model_sparsity": 0.48716606536417384, + "compression_loss": 59.520450592041016, + "distillation_loss": 1.7178672552108765, + "epoch": 2.71, + "learning_rate": 4.0490278951817415e-05, + "loss": 61.4408, + "step": 3208, + "task_loss": 1.9305129051208496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5558915860154006, + "compression/movement_sparsity/importance_threshold": -0.003110417562697294, + "compression/movement_sparsity/linear_layer_sparsity": 0.5050956499491744, + "compression/movement_sparsity/model_sparsity": 0.4877440605174524, + "compression_loss": 59.57276153564453, + "distillation_loss": 1.8688201904296875, + "epoch": 2.71, + "learning_rate": 4.04855827932751e-05, + "loss": 61.6319, + "step": 3209, + "task_loss": 2.906649351119995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5563830375539593, + "compression/movement_sparsity/importance_threshold": -0.003106975566444548, + "compression/movement_sparsity/linear_layer_sparsity": 0.505801250644859, + "compression/movement_sparsity/model_sparsity": 0.48842542165855857, + "compression_loss": 59.62505340576172, + "distillation_loss": 1.2433862686157227, + "epoch": 2.71, + "learning_rate": 4.048088663473279e-05, + "loss": 60.8267, + "step": 3210, + "task_loss": 1.470594048500061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5568741263976722, + "compression/movement_sparsity/importance_threshold": -0.0031035361104103237, + "compression/movement_sparsity/linear_layer_sparsity": 0.5065314628225439, + "compression/movement_sparsity/model_sparsity": 0.48913054880154466, + "compression_loss": 59.677310943603516, + "distillation_loss": 2.014218807220459, + "epoch": 2.71, + "learning_rate": 4.047619047619048e-05, + "loss": 61.7412, + "step": 3211, + "task_loss": 1.7117359638214111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5573648526804245, + "compression/movement_sparsity/importance_threshold": -0.0031000991936569232, + "compression/movement_sparsity/linear_layer_sparsity": 0.5071971533291513, + "compression/movement_sparsity/model_sparsity": 0.4897733707913465, + "compression_loss": 59.729522705078125, + "distillation_loss": 1.9074573516845703, + "epoch": 2.71, + "learning_rate": 4.047149431764817e-05, + "loss": 61.5469, + "step": 3212, + "task_loss": 1.7530325651168823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.557855216536101, + "compression/movement_sparsity/importance_threshold": -0.003096664815246652, + "compression/movement_sparsity/linear_layer_sparsity": 0.5078406410356207, + "compression/movement_sparsity/model_sparsity": 0.49039475271549887, + "compression_loss": 59.78169250488281, + "distillation_loss": 2.1233768463134766, + "epoch": 2.72, + "learning_rate": 4.0466798159105854e-05, + "loss": 61.0694, + "step": 3213, + "task_loss": 1.3763554096221924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5583452180985871, + "compression/movement_sparsity/importance_threshold": -0.0030932329742418153, + "compression/movement_sparsity/linear_layer_sparsity": 0.5085824912009183, + "compression/movement_sparsity/model_sparsity": 0.49111111804542046, + "compression_loss": 59.83381652832031, + "distillation_loss": 1.0325219631195068, + "epoch": 2.72, + "learning_rate": 4.046210200056354e-05, + "loss": 61.6132, + "step": 3214, + "task_loss": 0.5445683598518372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5588348575017676, + "compression/movement_sparsity/importance_threshold": -0.0030898036697047156, + "compression/movement_sparsity/linear_layer_sparsity": 0.509191744622105, + "compression/movement_sparsity/model_sparsity": 0.49169944173730673, + "compression_loss": 59.8858757019043, + "distillation_loss": 2.161313056945801, + "epoch": 2.72, + "learning_rate": 4.0457405842021226e-05, + "loss": 61.295, + "step": 3215, + "task_loss": 1.3501858711242676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5593241348795281, + "compression/movement_sparsity/importance_threshold": -0.0030863769006976554, + "compression/movement_sparsity/linear_layer_sparsity": 0.5100961211922794, + "compression/movement_sparsity/model_sparsity": 0.4925727501901079, + "compression_loss": 59.93793487548828, + "distillation_loss": 2.4283390045166016, + "epoch": 2.72, + "learning_rate": 4.045270968347892e-05, + "loss": 62.1538, + "step": 3216, + "task_loss": 0.999991238117218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5598130503657535, + "compression/movement_sparsity/importance_threshold": -0.003082952666282942, + "compression/movement_sparsity/linear_layer_sparsity": 0.5107592122303422, + "compression/movement_sparsity/model_sparsity": 0.4932130620111065, + "compression_loss": 59.989952087402344, + "distillation_loss": 1.8886561393737793, + "epoch": 2.72, + "learning_rate": 4.0448013524936606e-05, + "loss": 61.4217, + "step": 3217, + "task_loss": 1.4370546340942383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5603016040943287, + "compression/movement_sparsity/importance_threshold": -0.0030795309655228783, + "compression/movement_sparsity/linear_layer_sparsity": 0.5113502574475489, + "compression/movement_sparsity/model_sparsity": 0.4937838030068345, + "compression_loss": 60.04193878173828, + "distillation_loss": 1.7901849746704102, + "epoch": 2.72, + "learning_rate": 4.044331736639429e-05, + "loss": 61.66, + "step": 3218, + "task_loss": 1.1233326196670532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5607897961991393, + "compression/movement_sparsity/importance_threshold": -0.0030761117974797674, + "compression/movement_sparsity/linear_layer_sparsity": 0.5121048783963845, + "compression/movement_sparsity/model_sparsity": 0.494512500404592, + "compression_loss": 60.0938835144043, + "distillation_loss": 1.6310561895370483, + "epoch": 2.72, + "learning_rate": 4.043862120785198e-05, + "loss": 61.6303, + "step": 3219, + "task_loss": 1.484907865524292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.56127762681407, + "compression/movement_sparsity/importance_threshold": -0.003072695161215915, + "compression/movement_sparsity/linear_layer_sparsity": 0.5128085712252474, + "compression/movement_sparsity/model_sparsity": 0.495192019219971, + "compression_loss": 60.145790100097656, + "distillation_loss": 1.3487658500671387, + "epoch": 2.72, + "learning_rate": 4.0433925049309665e-05, + "loss": 61.7724, + "step": 3220, + "task_loss": 0.9770619869232178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5617650960730064, + "compression/movement_sparsity/importance_threshold": -0.0030692810557936216, + "compression/movement_sparsity/linear_layer_sparsity": 0.5134304404157929, + "compression/movement_sparsity/model_sparsity": 0.4957925252907279, + "compression_loss": 60.197654724121094, + "distillation_loss": 2.3097586631774902, + "epoch": 2.72, + "learning_rate": 4.042922889076736e-05, + "loss": 62.1246, + "step": 3221, + "task_loss": 1.3383524417877197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5622522041098333, + "compression/movement_sparsity/importance_threshold": -0.0030658694802751943, + "compression/movement_sparsity/linear_layer_sparsity": 0.5140780419600965, + "compression/movement_sparsity/model_sparsity": 0.49641787972972945, + "compression_loss": 60.24950408935547, + "distillation_loss": 1.5397720336914062, + "epoch": 2.72, + "learning_rate": 4.042453273222504e-05, + "loss": 61.928, + "step": 3222, + "task_loss": 1.3492388725280762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5627389510584361, + "compression/movement_sparsity/importance_threshold": -0.0030624604337229355, + "compression/movement_sparsity/linear_layer_sparsity": 0.5147180120371133, + "compression/movement_sparsity/model_sparsity": 0.49703586486582246, + "compression_loss": 60.301292419433594, + "distillation_loss": 2.28843355178833, + "epoch": 2.72, + "learning_rate": 4.041983657368273e-05, + "loss": 62.175, + "step": 3223, + "task_loss": 1.7072094678878784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5632253370526996, + "compression/movement_sparsity/importance_threshold": -0.003059053915199151, + "compression/movement_sparsity/linear_layer_sparsity": 0.5153410259477519, + "compression/movement_sparsity/model_sparsity": 0.4976374763320156, + "compression_loss": 60.35306167602539, + "distillation_loss": 2.6227030754089355, + "epoch": 2.72, + "learning_rate": 4.041514041514042e-05, + "loss": 62.2472, + "step": 3224, + "task_loss": 1.4026628732681274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5637113622265093, + "compression/movement_sparsity/importance_threshold": -0.0030556499237661414, + "compression/movement_sparsity/linear_layer_sparsity": 0.5159235692334344, + "compression/movement_sparsity/model_sparsity": 0.4982000074637221, + "compression_loss": 60.404754638671875, + "distillation_loss": 2.2320141792297363, + "epoch": 2.73, + "learning_rate": 4.041044425659811e-05, + "loss": 62.3531, + "step": 3225, + "task_loss": 1.5864291191101074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5641970267137502, + "compression/movement_sparsity/importance_threshold": -0.003052248458486214, + "compression/movement_sparsity/linear_layer_sparsity": 0.5167145111968888, + "compression/movement_sparsity/model_sparsity": 0.4989637781375098, + "compression_loss": 60.45641326904297, + "distillation_loss": 1.2258720397949219, + "epoch": 2.73, + "learning_rate": 4.040574809805579e-05, + "loss": 62.1451, + "step": 3226, + "task_loss": 0.7619501948356628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5646823306483073, + "compression/movement_sparsity/importance_threshold": -0.003048849518421672, + "compression/movement_sparsity/linear_layer_sparsity": 0.5174092609000247, + "compression/movement_sparsity/model_sparsity": 0.4996346610510429, + "compression_loss": 60.508087158203125, + "distillation_loss": 2.245507001876831, + "epoch": 2.73, + "learning_rate": 4.0401051939513476e-05, + "loss": 62.4336, + "step": 3227, + "task_loss": 1.2263633012771606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5651672741640659, + "compression/movement_sparsity/importance_threshold": -0.0030454531026348182, + "compression/movement_sparsity/linear_layer_sparsity": 0.5179392901514388, + "compression/movement_sparsity/model_sparsity": 0.5001464821671102, + "compression_loss": 60.5596809387207, + "distillation_loss": 1.8125187158584595, + "epoch": 2.73, + "learning_rate": 4.039635578097117e-05, + "loss": 62.4666, + "step": 3228, + "task_loss": 1.4509414434432983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.565651857394911, + "compression/movement_sparsity/importance_threshold": -0.003042059210187959, + "compression/movement_sparsity/linear_layer_sparsity": 0.5185626617871065, + "compression/movement_sparsity/model_sparsity": 0.5007484390693773, + "compression_loss": 60.611244201660156, + "distillation_loss": 1.2947171926498413, + "epoch": 2.73, + "learning_rate": 4.0391659622428855e-05, + "loss": 61.9607, + "step": 3229, + "task_loss": 0.3889021873474121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5661360804747277, + "compression/movement_sparsity/importance_threshold": -0.003038667840143397, + "compression/movement_sparsity/linear_layer_sparsity": 0.5192293539237952, + "compression/movement_sparsity/model_sparsity": 0.5013922282801858, + "compression_loss": 60.662784576416016, + "distillation_loss": 2.9191157817840576, + "epoch": 2.73, + "learning_rate": 4.038696346388654e-05, + "loss": 62.8384, + "step": 3230, + "task_loss": 1.829105019569397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5666199435374015, + "compression/movement_sparsity/importance_threshold": -0.0030352789915634344, + "compression/movement_sparsity/linear_layer_sparsity": 0.5198588665057954, + "compression/movement_sparsity/model_sparsity": 0.502000115168387, + "compression_loss": 60.714298248291016, + "distillation_loss": 1.5689562559127808, + "epoch": 2.73, + "learning_rate": 4.038226730534423e-05, + "loss": 62.389, + "step": 3231, + "task_loss": 1.5665154457092285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5671034467168172, + "compression/movement_sparsity/importance_threshold": -0.0030318926635103773, + "compression/movement_sparsity/linear_layer_sparsity": 0.5204391203512917, + "compression/movement_sparsity/model_sparsity": 0.5025604355092209, + "compression_loss": 60.765750885009766, + "distillation_loss": 2.257338523864746, + "epoch": 2.73, + "learning_rate": 4.037757114680192e-05, + "loss": 62.7614, + "step": 3232, + "task_loss": 1.7667593955993652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5675865901468602, + "compression/movement_sparsity/importance_threshold": -0.0030285088550465293, + "compression/movement_sparsity/linear_layer_sparsity": 0.5209478291909729, + "compression/movement_sparsity/model_sparsity": 0.5030516686352876, + "compression_loss": 60.81721115112305, + "distillation_loss": 1.8851289749145508, + "epoch": 2.73, + "learning_rate": 4.037287498825961e-05, + "loss": 62.406, + "step": 3233, + "task_loss": 1.1357941627502441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5680693739614154, + "compression/movement_sparsity/importance_threshold": -0.0030251275652341935, + "compression/movement_sparsity/linear_layer_sparsity": 0.5216459653577173, + "compression/movement_sparsity/model_sparsity": 0.5037258216769864, + "compression_loss": 60.868629455566406, + "distillation_loss": 1.6463727951049805, + "epoch": 2.73, + "learning_rate": 4.0368178829717294e-05, + "loss": 62.7615, + "step": 3234, + "task_loss": 0.6807820796966553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5685517982943682, + "compression/movement_sparsity/importance_threshold": -0.0030217487931356738, + "compression/movement_sparsity/linear_layer_sparsity": 0.5221901128236124, + "compression/movement_sparsity/model_sparsity": 0.5042512760034346, + "compression_loss": 60.91997146606445, + "distillation_loss": 2.4444820880889893, + "epoch": 2.73, + "learning_rate": 4.036348267117498e-05, + "loss": 62.8409, + "step": 3235, + "task_loss": 1.5648764371871948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5690338632796035, + "compression/movement_sparsity/importance_threshold": -0.0030183725378132758, + "compression/movement_sparsity/linear_layer_sparsity": 0.522681877421083, + "compression/movement_sparsity/model_sparsity": 0.5047261469741372, + "compression_loss": 60.97130584716797, + "distillation_loss": 1.3508840799331665, + "epoch": 2.73, + "learning_rate": 4.035878651263267e-05, + "loss": 62.5742, + "step": 3236, + "task_loss": 0.8312810659408569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5695155690510064, + "compression/movement_sparsity/importance_threshold": -0.0030149987983293033, + "compression/movement_sparsity/linear_layer_sparsity": 0.5231873309629996, + "compression/movement_sparsity/model_sparsity": 0.505214236631932, + "compression_loss": 61.022579193115234, + "distillation_loss": 2.256608247756958, + "epoch": 2.74, + "learning_rate": 4.035409035409036e-05, + "loss": 63.2711, + "step": 3237, + "task_loss": 0.9562850594520569 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5699969157424623, + "compression/movement_sparsity/importance_threshold": -0.0030116275737460596, + "compression/movement_sparsity/linear_layer_sparsity": 0.5237235846299196, + "compression/movement_sparsity/model_sparsity": 0.5057320683356841, + "compression_loss": 61.07379913330078, + "distillation_loss": 1.7904212474822998, + "epoch": 2.74, + "learning_rate": 4.0349394195548046e-05, + "loss": 63.194, + "step": 3238, + "task_loss": 1.2051401138305664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5704779034878561, + "compression/movement_sparsity/importance_threshold": -0.0030082588631258475, + "compression/movement_sparsity/linear_layer_sparsity": 0.5243247892379522, + "compression/movement_sparsity/model_sparsity": 0.506312619715909, + "compression_loss": 61.12504196166992, + "distillation_loss": 2.2610394954681396, + "epoch": 2.74, + "learning_rate": 4.034469803700573e-05, + "loss": 62.9409, + "step": 3239, + "task_loss": 0.9777759313583374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5709585324210729, + "compression/movement_sparsity/importance_threshold": -0.0030048926655309744, + "compression/movement_sparsity/linear_layer_sparsity": 0.5249596438470532, + "compression/movement_sparsity/model_sparsity": 0.5069256651161461, + "compression_loss": 61.17618942260742, + "distillation_loss": 1.5760114192962646, + "epoch": 2.74, + "learning_rate": 4.034000187846342e-05, + "loss": 62.9165, + "step": 3240, + "task_loss": 1.9501545429229736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5714388026759982, + "compression/movement_sparsity/importance_threshold": -0.00300152898002374, + "compression/movement_sparsity/linear_layer_sparsity": 0.5255809049050493, + "compression/movement_sparsity/model_sparsity": 0.5075255839455776, + "compression_loss": 61.22731018066406, + "distillation_loss": 1.9022250175476074, + "epoch": 2.74, + "learning_rate": 4.0335305719921105e-05, + "loss": 62.6099, + "step": 3241, + "task_loss": 0.9030871987342834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5719187143865168, + "compression/movement_sparsity/importance_threshold": -0.0029981678056664515, + "compression/movement_sparsity/linear_layer_sparsity": 0.5261873084501711, + "compression/movement_sparsity/model_sparsity": 0.5081111556634089, + "compression_loss": 61.27836608886719, + "distillation_loss": 1.5674872398376465, + "epoch": 2.74, + "learning_rate": 4.03306095613788e-05, + "loss": 62.6458, + "step": 3242, + "task_loss": 0.7744656801223755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.572398267686514, + "compression/movement_sparsity/importance_threshold": -0.002994809141521411, + "compression/movement_sparsity/linear_layer_sparsity": 0.526917162902827, + "compression/movement_sparsity/model_sparsity": 0.5088159373703212, + "compression_loss": 61.329410552978516, + "distillation_loss": 1.3554848432540894, + "epoch": 2.74, + "learning_rate": 4.032591340283648e-05, + "loss": 63.06, + "step": 3243, + "task_loss": 1.7756683826446533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.572877462709875, + "compression/movement_sparsity/importance_threshold": -0.0029914529866509228, + "compression/movement_sparsity/linear_layer_sparsity": 0.5275406060835005, + "compression/movement_sparsity/model_sparsity": 0.5094179633598029, + "compression_loss": 61.38042068481445, + "distillation_loss": 1.85811448097229, + "epoch": 2.74, + "learning_rate": 4.032121724429417e-05, + "loss": 63.3787, + "step": 3244, + "task_loss": 1.9419097900390625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5733562995904848, + "compression/movement_sparsity/importance_threshold": -0.0029880993401172912, + "compression/movement_sparsity/linear_layer_sparsity": 0.5282233839223299, + "compression/movement_sparsity/model_sparsity": 0.5100772856793984, + "compression_loss": 61.43135452270508, + "distillation_loss": 2.0800514221191406, + "epoch": 2.74, + "learning_rate": 4.031652108575186e-05, + "loss": 63.5189, + "step": 3245, + "task_loss": 2.283797025680542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5738347784622286, + "compression/movement_sparsity/importance_threshold": -0.0029847482009828213, + "compression/movement_sparsity/linear_layer_sparsity": 0.5288371923755537, + "compression/movement_sparsity/model_sparsity": 0.5106700079239581, + "compression_loss": 61.48225784301758, + "distillation_loss": 2.069443702697754, + "epoch": 2.74, + "learning_rate": 4.0311824927209544e-05, + "loss": 63.3955, + "step": 3246, + "task_loss": 1.1308705806732178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5743128994589916, + "compression/movement_sparsity/importance_threshold": -0.002981399568309815, + "compression/movement_sparsity/linear_layer_sparsity": 0.5296742927919266, + "compression/movement_sparsity/model_sparsity": 0.5114783513658064, + "compression_loss": 61.53312301635742, + "distillation_loss": 2.043513298034668, + "epoch": 2.74, + "learning_rate": 4.030712876866723e-05, + "loss": 63.1455, + "step": 3247, + "task_loss": 1.7300862073898315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5747906627146587, + "compression/movement_sparsity/importance_threshold": -0.0029780534411605773, + "compression/movement_sparsity/linear_layer_sparsity": 0.5302532588273182, + "compression/movement_sparsity/model_sparsity": 0.5120374281367746, + "compression_loss": 61.5839729309082, + "distillation_loss": 1.3611375093460083, + "epoch": 2.75, + "learning_rate": 4.0302432610124916e-05, + "loss": 63.099, + "step": 3248, + "task_loss": 0.7622065544128418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5752680683631153, + "compression/movement_sparsity/importance_threshold": -0.002974709818597413, + "compression/movement_sparsity/linear_layer_sparsity": 0.5308463430771908, + "compression/movement_sparsity/model_sparsity": 0.5126101381181235, + "compression_loss": 61.634769439697266, + "distillation_loss": 2.166154146194458, + "epoch": 2.75, + "learning_rate": 4.029773645158261e-05, + "loss": 63.5365, + "step": 3249, + "task_loss": 1.5577365159988403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5757451165382462, + "compression/movement_sparsity/importance_threshold": -0.0029713686996826263, + "compression/movement_sparsity/linear_layer_sparsity": 0.531406790880244, + "compression/movement_sparsity/model_sparsity": 0.5131513328150027, + "compression_loss": 61.685523986816406, + "distillation_loss": 2.445378541946411, + "epoch": 2.75, + "learning_rate": 4.0293040293040296e-05, + "loss": 63.3946, + "step": 3250, + "task_loss": 1.5867260694503784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.576221807373937, + "compression/movement_sparsity/importance_threshold": -0.0029680300834785183, + "compression/movement_sparsity/linear_layer_sparsity": 0.5321133574335071, + "compression/movement_sparsity/model_sparsity": 0.5138336266335082, + "compression_loss": 61.73617172241211, + "distillation_loss": 1.8073227405548096, + "epoch": 2.75, + "learning_rate": 4.028834413449798e-05, + "loss": 63.4321, + "step": 3251, + "task_loss": 1.873195767402649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5766981410040727, + "compression/movement_sparsity/importance_threshold": -0.0029646939690473953, + "compression/movement_sparsity/linear_layer_sparsity": 0.5328044106446759, + "compression/movement_sparsity/model_sparsity": 0.514500940040945, + "compression_loss": 61.78681564331055, + "distillation_loss": 2.415198564529419, + "epoch": 2.75, + "learning_rate": 4.028364797595567e-05, + "loss": 63.9861, + "step": 3252, + "task_loss": 3.182394504547119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5771741175625382, + "compression/movement_sparsity/importance_threshold": -0.002961360355451561, + "compression/movement_sparsity/linear_layer_sparsity": 0.5334121377724053, + "compression/movement_sparsity/model_sparsity": 0.5150877898722496, + "compression_loss": 61.83740234375, + "distillation_loss": 2.565702199935913, + "epoch": 2.75, + "learning_rate": 4.0278951817413355e-05, + "loss": 63.6319, + "step": 3253, + "task_loss": 1.5635552406311035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.577649737183219, + "compression/movement_sparsity/importance_threshold": -0.002958029241753318, + "compression/movement_sparsity/linear_layer_sparsity": 0.5341089622807098, + "compression/movement_sparsity/model_sparsity": 0.515760676315011, + "compression_loss": 61.88798141479492, + "distillation_loss": 3.2892961502075195, + "epoch": 2.75, + "learning_rate": 4.027425565887105e-05, + "loss": 64.5098, + "step": 3254, + "task_loss": 2.893268346786499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.578125, + "compression/movement_sparsity/importance_threshold": -0.0029547006270149723, + "compression/movement_sparsity/linear_layer_sparsity": 0.5348495127117351, + "compression/movement_sparsity/model_sparsity": 0.516475786560531, + "compression_loss": 61.938507080078125, + "distillation_loss": 2.1112589836120605, + "epoch": 2.75, + "learning_rate": 4.0269559500328734e-05, + "loss": 63.6508, + "step": 3255, + "task_loss": 0.6777961850166321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5785999061467664, + "compression/movement_sparsity/importance_threshold": -0.002951374510298827, + "compression/movement_sparsity/linear_layer_sparsity": 0.5354863825051666, + "compression/movement_sparsity/model_sparsity": 0.5170907779173173, + "compression_loss": 61.98895263671875, + "distillation_loss": 1.5397002696990967, + "epoch": 2.75, + "learning_rate": 4.026486334178642e-05, + "loss": 63.4923, + "step": 3256, + "task_loss": 1.7349225282669067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5790744557574032, + "compression/movement_sparsity/importance_threshold": -0.0029480508906671856, + "compression/movement_sparsity/linear_layer_sparsity": 0.536085297673013, + "compression/movement_sparsity/model_sparsity": 0.5176691185066697, + "compression_loss": 62.03940963745117, + "distillation_loss": 2.716867685317993, + "epoch": 2.75, + "learning_rate": 4.026016718324411e-05, + "loss": 64.047, + "step": 3257, + "task_loss": 2.048130512237549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5795486489657957, + "compression/movement_sparsity/importance_threshold": -0.002944729767182354, + "compression/movement_sparsity/linear_layer_sparsity": 0.5366919158531522, + "compression/movement_sparsity/model_sparsity": 0.5182548974861454, + "compression_loss": 62.089839935302734, + "distillation_loss": 1.693877935409546, + "epoch": 2.75, + "learning_rate": 4.025547102470179e-05, + "loss": 64.1566, + "step": 3258, + "task_loss": 1.4377888441085815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5800224859058292, + "compression/movement_sparsity/importance_threshold": -0.0029414111389066333, + "compression/movement_sparsity/linear_layer_sparsity": 0.5373102793384128, + "compression/movement_sparsity/model_sparsity": 0.5188520182833787, + "compression_loss": 62.14025115966797, + "distillation_loss": 2.2593026161193848, + "epoch": 2.75, + "learning_rate": 4.0250774866159486e-05, + "loss": 64.2672, + "step": 3259, + "task_loss": 1.5536476373672485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5804959667113884, + "compression/movement_sparsity/importance_threshold": -0.0029380950049023302, + "compression/movement_sparsity/linear_layer_sparsity": 0.537951119879667, + "compression/movement_sparsity/model_sparsity": 0.5194708439805847, + "compression_loss": 62.19060134887695, + "distillation_loss": 3.4538440704345703, + "epoch": 2.76, + "learning_rate": 4.0246078707617166e-05, + "loss": 64.6068, + "step": 3260, + "task_loss": 1.847519040107727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.580969091516359, + "compression/movement_sparsity/importance_threshold": -0.0029347813642317455, + "compression/movement_sparsity/linear_layer_sparsity": 0.538595764230397, + "compression/movement_sparsity/model_sparsity": 0.5200933428147092, + "compression_loss": 62.24092102050781, + "distillation_loss": 2.5441994667053223, + "epoch": 2.76, + "learning_rate": 4.024138254907486e-05, + "loss": 64.0328, + "step": 3261, + "task_loss": 2.2425315380096436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5814418604546254, + "compression/movement_sparsity/importance_threshold": -0.0029314702159571873, + "compression/movement_sparsity/linear_layer_sparsity": 0.5392170491367284, + "compression/movement_sparsity/model_sparsity": 0.5206932846732121, + "compression_loss": 62.29121398925781, + "distillation_loss": 2.041661500930786, + "epoch": 2.76, + "learning_rate": 4.0236686390532545e-05, + "loss": 64.5341, + "step": 3262, + "task_loss": 2.105586051940918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5819142736600736, + "compression/movement_sparsity/importance_threshold": -0.002928161559140955, + "compression/movement_sparsity/linear_layer_sparsity": 0.539761208526791, + "compression/movement_sparsity/model_sparsity": 0.521218750514196, + "compression_loss": 62.341468811035156, + "distillation_loss": 1.197457194328308, + "epoch": 2.76, + "learning_rate": 4.023199023199024e-05, + "loss": 63.9823, + "step": 3263, + "task_loss": 0.9937968850135803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.582386331266588, + "compression/movement_sparsity/importance_threshold": -0.0029248553928453574, + "compression/movement_sparsity/linear_layer_sparsity": 0.5405459499230748, + "compression/movement_sparsity/model_sparsity": 0.5219765336293705, + "compression_loss": 62.39168167114258, + "distillation_loss": 1.215428113937378, + "epoch": 2.76, + "learning_rate": 4.0227294073447925e-05, + "loss": 64.2882, + "step": 3264, + "task_loss": 1.1487324237823486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5828580334080542, + "compression/movement_sparsity/importance_threshold": -0.0029215517161326953, + "compression/movement_sparsity/linear_layer_sparsity": 0.5410592495672958, + "compression/movement_sparsity/model_sparsity": 0.5224721998517181, + "compression_loss": 62.441856384277344, + "distillation_loss": 2.4234817028045654, + "epoch": 2.76, + "learning_rate": 4.0222597914905604e-05, + "loss": 64.4835, + "step": 3265, + "task_loss": 1.4215269088745117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5833293802183572, + "compression/movement_sparsity/importance_threshold": -0.0029182505280652728, + "compression/movement_sparsity/linear_layer_sparsity": 0.5417021649137186, + "compression/movement_sparsity/model_sparsity": 0.5230930290781524, + "compression_loss": 62.491973876953125, + "distillation_loss": 2.471052408218384, + "epoch": 2.76, + "learning_rate": 4.02179017563633e-05, + "loss": 64.6163, + "step": 3266, + "task_loss": 1.6392581462860107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.583800371831382, + "compression/movement_sparsity/importance_threshold": -0.0029149518277053962, + "compression/movement_sparsity/linear_layer_sparsity": 0.5423657448426544, + "compression/movement_sparsity/model_sparsity": 0.5237338129951186, + "compression_loss": 62.54207229614258, + "distillation_loss": 1.6593048572540283, + "epoch": 2.76, + "learning_rate": 4.0213205597820984e-05, + "loss": 64.208, + "step": 3267, + "task_loss": 0.7882299423217773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5842710083810142, + "compression/movement_sparsity/importance_threshold": -0.0029116556141153644, + "compression/movement_sparsity/linear_layer_sparsity": 0.5430153019504504, + "compression/movement_sparsity/model_sparsity": 0.5243610558179904, + "compression_loss": 62.5921630859375, + "distillation_loss": 2.1852846145629883, + "epoch": 2.76, + "learning_rate": 4.020850943927868e-05, + "loss": 64.5344, + "step": 3268, + "task_loss": 1.5038312673568726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5847412900011383, + "compression/movement_sparsity/importance_threshold": -0.002908361886357488, + "compression/movement_sparsity/linear_layer_sparsity": 0.5436665284417155, + "compression/movement_sparsity/model_sparsity": 0.5249899106758734, + "compression_loss": 62.64218521118164, + "distillation_loss": 3.0454206466674805, + "epoch": 2.76, + "learning_rate": 4.0203813280736356e-05, + "loss": 64.8583, + "step": 3269, + "task_loss": 2.502751350402832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.58521121682564, + "compression/movement_sparsity/importance_threshold": -0.0029050706434940654, + "compression/movement_sparsity/linear_layer_sparsity": 0.5444049682950691, + "compression/movement_sparsity/model_sparsity": 0.5257029828485578, + "compression_loss": 62.69215393066406, + "distillation_loss": 1.3584942817687988, + "epoch": 2.76, + "learning_rate": 4.019911712219405e-05, + "loss": 64.272, + "step": 3270, + "task_loss": 1.1215400695800781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5856807889884039, + "compression/movement_sparsity/importance_threshold": -0.002901781884587405, + "compression/movement_sparsity/linear_layer_sparsity": 0.5451045472860976, + "compression/movement_sparsity/model_sparsity": 0.5263785291490877, + "compression_loss": 62.742103576660156, + "distillation_loss": 1.1327388286590576, + "epoch": 2.76, + "learning_rate": 4.0194420963651736e-05, + "loss": 64.9521, + "step": 3271, + "task_loss": 1.3774384260177612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5861500066233158, + "compression/movement_sparsity/importance_threshold": -0.002898495608699806, + "compression/movement_sparsity/linear_layer_sparsity": 0.5458082520391281, + "compression/movement_sparsity/model_sparsity": 0.5270580594790025, + "compression_loss": 62.791954040527344, + "distillation_loss": 2.9390554428100586, + "epoch": 2.77, + "learning_rate": 4.018972480510942e-05, + "loss": 65.0331, + "step": 3272, + "task_loss": 2.722241163253784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5866188698642603, + "compression/movement_sparsity/importance_threshold": -0.002895211814893578, + "compression/movement_sparsity/linear_layer_sparsity": 0.5464149059917702, + "compression/movement_sparsity/model_sparsity": 0.5276438730020856, + "compression_loss": 62.841827392578125, + "distillation_loss": 2.035230875015259, + "epoch": 2.77, + "learning_rate": 4.018502864656711e-05, + "loss": 65.0773, + "step": 3273, + "task_loss": 1.7724132537841797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5870873788451227, + "compression/movement_sparsity/importance_threshold": -0.00289193050223102, + "compression/movement_sparsity/linear_layer_sparsity": 0.5470722257326972, + "compression/movement_sparsity/model_sparsity": 0.5282786117877596, + "compression_loss": 62.891624450683594, + "distillation_loss": 2.7584996223449707, + "epoch": 2.77, + "learning_rate": 4.0180332488024795e-05, + "loss": 65.0842, + "step": 3274, + "task_loss": 1.3805007934570312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5875555336997882, + "compression/movement_sparsity/importance_threshold": -0.002888651669774439, + "compression/movement_sparsity/linear_layer_sparsity": 0.5478709064809473, + "compression/movement_sparsity/model_sparsity": 0.5290498553952779, + "compression_loss": 62.9413948059082, + "distillation_loss": 1.578752040863037, + "epoch": 2.77, + "learning_rate": 4.017563632948249e-05, + "loss": 64.3718, + "step": 3275, + "task_loss": 1.3385707139968872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.588023334562142, + "compression/movement_sparsity/importance_threshold": -0.0028853753165861374, + "compression/movement_sparsity/linear_layer_sparsity": 0.5485831727863403, + "compression/movement_sparsity/model_sparsity": 0.5297376531618933, + "compression_loss": 62.99110412597656, + "distillation_loss": 2.244793176651001, + "epoch": 2.77, + "learning_rate": 4.0170940170940174e-05, + "loss": 64.8635, + "step": 3276, + "task_loss": 1.7686864137649536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5884907815660689, + "compression/movement_sparsity/importance_threshold": -0.00288210144172842, + "compression/movement_sparsity/linear_layer_sparsity": 0.54919685007372, + "compression/movement_sparsity/model_sparsity": 0.5303302487465593, + "compression_loss": 63.040802001953125, + "distillation_loss": 2.023423671722412, + "epoch": 2.77, + "learning_rate": 4.016624401239786e-05, + "loss": 65.1005, + "step": 3277, + "task_loss": 0.7540304064750671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5889578748454543, + "compression/movement_sparsity/importance_threshold": -0.0028788300442635925, + "compression/movement_sparsity/linear_layer_sparsity": 0.5497256869083705, + "compression/movement_sparsity/model_sparsity": 0.5308409184090472, + "compression_loss": 63.0904426574707, + "distillation_loss": 1.6696819067001343, + "epoch": 2.77, + "learning_rate": 4.016154785385555e-05, + "loss": 64.8621, + "step": 3278, + "task_loss": 0.9298416376113892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5894246145341837, + "compression/movement_sparsity/importance_threshold": -0.0028755611232539526, + "compression/movement_sparsity/linear_layer_sparsity": 0.5503212394609437, + "compression/movement_sparsity/model_sparsity": 0.5314160118993055, + "compression_loss": 63.1400260925293, + "distillation_loss": 2.2118468284606934, + "epoch": 2.77, + "learning_rate": 4.0156851695313233e-05, + "loss": 65.2393, + "step": 3279, + "task_loss": 1.3641406297683716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5898910007661413, + "compression/movement_sparsity/importance_threshold": -0.0028722946777618137, + "compression/movement_sparsity/linear_layer_sparsity": 0.5508957935543101, + "compression/movement_sparsity/model_sparsity": 0.5319708282920297, + "compression_loss": 63.1895637512207, + "distillation_loss": 2.0860366821289062, + "epoch": 2.77, + "learning_rate": 4.0152155536770927e-05, + "loss": 64.7543, + "step": 3280, + "task_loss": 1.0176345109939575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5903570336752134, + "compression/movement_sparsity/importance_threshold": -0.0028690307068494694, + "compression/movement_sparsity/linear_layer_sparsity": 0.5515128573052984, + "compression/movement_sparsity/model_sparsity": 0.5325666940048613, + "compression_loss": 63.23905944824219, + "distillation_loss": 2.1340889930725098, + "epoch": 2.77, + "learning_rate": 4.014745937822861e-05, + "loss": 65.3394, + "step": 3281, + "task_loss": 1.4301774501800537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5908227133952841, + "compression/movement_sparsity/importance_threshold": -0.002865769209579233, + "compression/movement_sparsity/linear_layer_sparsity": 0.5521910085670851, + "compression/movement_sparsity/model_sparsity": 0.5332215486845685, + "compression_loss": 63.288475036621094, + "distillation_loss": 2.6284728050231934, + "epoch": 2.77, + "learning_rate": 4.01427632196863e-05, + "loss": 65.4703, + "step": 3282, + "task_loss": 2.27504563331604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5912880400602394, + "compression/movement_sparsity/importance_threshold": -0.0028625101850134017, + "compression/movement_sparsity/linear_layer_sparsity": 0.5528403391156961, + "compression/movement_sparsity/model_sparsity": 0.5338485727312602, + "compression_loss": 63.33785629272461, + "distillation_loss": 1.3243110179901123, + "epoch": 2.77, + "learning_rate": 4.0138067061143986e-05, + "loss": 64.9801, + "step": 3283, + "task_loss": 1.5111138820648193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5917530138039635, + "compression/movement_sparsity/importance_threshold": -0.002859253632214286, + "compression/movement_sparsity/linear_layer_sparsity": 0.5535664016830438, + "compression/movement_sparsity/model_sparsity": 0.5345496928157898, + "compression_loss": 63.3872184753418, + "distillation_loss": 2.197932720184326, + "epoch": 2.78, + "learning_rate": 4.013337090260167e-05, + "loss": 65.3257, + "step": 3284, + "task_loss": 1.1503630876541138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5922176347603425, + "compression/movement_sparsity/importance_threshold": -0.0028559995502441824, + "compression/movement_sparsity/linear_layer_sparsity": 0.55432217927614, + "compression/movement_sparsity/model_sparsity": 0.5352795071235193, + "compression_loss": 63.436546325683594, + "distillation_loss": 1.4603022336959839, + "epoch": 2.78, + "learning_rate": 4.0128674744059365e-05, + "loss": 64.9047, + "step": 3285, + "task_loss": 0.7153134942054749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5926819030632609, + "compression/movement_sparsity/importance_threshold": -0.0028527479381654005, + "compression/movement_sparsity/linear_layer_sparsity": 0.554878215137168, + "compression/movement_sparsity/model_sparsity": 0.5358164414421546, + "compression_loss": 63.485843658447266, + "distillation_loss": 2.9051706790924072, + "epoch": 2.78, + "learning_rate": 4.0123978585517045e-05, + "loss": 65.7868, + "step": 3286, + "task_loss": 2.030365467071533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5931458188466039, + "compression/movement_sparsity/importance_threshold": -0.0028494987950402435, + "compression/movement_sparsity/linear_layer_sparsity": 0.5555114599836382, + "compression/movement_sparsity/model_sparsity": 0.5364279323800595, + "compression_loss": 63.53511047363281, + "distillation_loss": 1.8905597925186157, + "epoch": 2.78, + "learning_rate": 4.011928242697474e-05, + "loss": 65.2966, + "step": 3287, + "task_loss": 1.7784000635147095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5936093822442572, + "compression/movement_sparsity/importance_threshold": -0.002846252119931011, + "compression/movement_sparsity/linear_layer_sparsity": 0.5560774167521391, + "compression/movement_sparsity/model_sparsity": 0.5369744467924757, + "compression_loss": 63.58433532714844, + "distillation_loss": 1.6999075412750244, + "epoch": 2.78, + "learning_rate": 4.0114586268432424e-05, + "loss": 65.319, + "step": 3288, + "task_loss": 0.8752533793449402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5940725933901052, + "compression/movement_sparsity/importance_threshold": -0.002843007911900013, + "compression/movement_sparsity/linear_layer_sparsity": 0.556637709541013, + "compression/movement_sparsity/model_sparsity": 0.5375154918003897, + "compression_loss": 63.63349151611328, + "distillation_loss": 1.7299723625183105, + "epoch": 2.78, + "learning_rate": 4.010989010989011e-05, + "loss": 65.6627, + "step": 3289, + "task_loss": 0.3448010981082916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5945354524180337, + "compression/movement_sparsity/importance_threshold": -0.002839766170009547, + "compression/movement_sparsity/linear_layer_sparsity": 0.5572899257381918, + "compression/movement_sparsity/model_sparsity": 0.5381453023647437, + "compression_loss": 63.682621002197266, + "distillation_loss": 2.7418320178985596, + "epoch": 2.78, + "learning_rate": 4.01051939513478e-05, + "loss": 65.4854, + "step": 3290, + "task_loss": 1.7145569324493408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5949979594619271, + "compression/movement_sparsity/importance_threshold": -0.002836526893321925, + "compression/movement_sparsity/linear_layer_sparsity": 0.5578565502600804, + "compression/movement_sparsity/model_sparsity": 0.5386924615911645, + "compression_loss": 63.73171615600586, + "distillation_loss": 2.606372356414795, + "epoch": 2.78, + "learning_rate": 4.010049779280548e-05, + "loss": 65.9603, + "step": 3291, + "task_loss": 1.443753957748413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5954601146556714, + "compression/movement_sparsity/importance_threshold": -0.002833290080899444, + "compression/movement_sparsity/linear_layer_sparsity": 0.5584307466284177, + "compression/movement_sparsity/model_sparsity": 0.5392469325478148, + "compression_loss": 63.780799865722656, + "distillation_loss": 2.7498373985290527, + "epoch": 2.78, + "learning_rate": 4.0095801634263176e-05, + "loss": 65.5984, + "step": 3292, + "task_loss": 2.266296863555908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.595921918133151, + "compression/movement_sparsity/importance_threshold": -0.002830055731804411, + "compression/movement_sparsity/linear_layer_sparsity": 0.558974524445116, + "compression/movement_sparsity/model_sparsity": 0.5397720299236533, + "compression_loss": 63.829811096191406, + "distillation_loss": 2.164581775665283, + "epoch": 2.78, + "learning_rate": 4.009110547572086e-05, + "loss": 65.972, + "step": 3293, + "task_loss": 0.7952256202697754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5963833700282513, + "compression/movement_sparsity/importance_threshold": -0.0028268238450991313, + "compression/movement_sparsity/linear_layer_sparsity": 0.5596265737039479, + "compression/movement_sparsity/model_sparsity": 0.5404016792845061, + "compression_loss": 63.87882995605469, + "distillation_loss": 2.37937068939209, + "epoch": 2.78, + "learning_rate": 4.0086409317178556e-05, + "loss": 66.034, + "step": 3294, + "task_loss": 0.9546102285385132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5968444704748577, + "compression/movement_sparsity/importance_threshold": -0.002823594419845905, + "compression/movement_sparsity/linear_layer_sparsity": 0.5601694095114029, + "compression/movement_sparsity/model_sparsity": 0.5409258670120167, + "compression_loss": 63.92778015136719, + "distillation_loss": 2.147037982940674, + "epoch": 2.78, + "learning_rate": 4.0081713158636235e-05, + "loss": 66.2352, + "step": 3295, + "task_loss": 1.916335940361023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5973052196068549, + "compression/movement_sparsity/importance_threshold": -0.0028203674551070396, + "compression/movement_sparsity/linear_layer_sparsity": 0.5607746564122641, + "compression/movement_sparsity/model_sparsity": 0.541510321819876, + "compression_loss": 63.97677230834961, + "distillation_loss": 2.069258451461792, + "epoch": 2.79, + "learning_rate": 4.007701700009392e-05, + "loss": 65.986, + "step": 3296, + "task_loss": 1.8564364910125732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5977656175581283, + "compression/movement_sparsity/importance_threshold": -0.002817142949944838, + "compression/movement_sparsity/linear_layer_sparsity": 0.5614591990279036, + "compression/movement_sparsity/model_sparsity": 0.5421713482907691, + "compression_loss": 64.02571105957031, + "distillation_loss": 2.293659210205078, + "epoch": 2.79, + "learning_rate": 4.0072320841551615e-05, + "loss": 66.058, + "step": 3297, + "task_loss": 2.4276583194732666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.598225664462563, + "compression/movement_sparsity/importance_threshold": -0.0028139209034216033, + "compression/movement_sparsity/linear_layer_sparsity": 0.5621149090061998, + "compression/movement_sparsity/model_sparsity": 0.5428045326141109, + "compression_loss": 64.07460021972656, + "distillation_loss": 2.927645683288574, + "epoch": 2.79, + "learning_rate": 4.00676246830093e-05, + "loss": 66.5742, + "step": 3298, + "task_loss": 2.8492398262023926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5986853604540443, + "compression/movement_sparsity/importance_threshold": -0.0028107013145996394, + "compression/movement_sparsity/linear_layer_sparsity": 0.5627901030744129, + "compression/movement_sparsity/model_sparsity": 0.5434565316889409, + "compression_loss": 64.12342834472656, + "distillation_loss": 1.5884689092636108, + "epoch": 2.79, + "learning_rate": 4.006292852446699e-05, + "loss": 65.89, + "step": 3299, + "task_loss": 0.5364198684692383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.599144705666457, + "compression/movement_sparsity/importance_threshold": -0.002807484182541253, + "compression/movement_sparsity/linear_layer_sparsity": 0.5633472955797015, + "compression/movement_sparsity/model_sparsity": 0.5439945829175482, + "compression_loss": 64.17223358154297, + "distillation_loss": 1.506592869758606, + "epoch": 2.79, + "learning_rate": 4.0058232365924674e-05, + "loss": 66.099, + "step": 3300, + "task_loss": 1.3622877597808838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.5996037002336866, + "compression/movement_sparsity/importance_threshold": -0.002804269506308743, + "compression/movement_sparsity/linear_layer_sparsity": 0.5639174735035456, + "compression/movement_sparsity/model_sparsity": 0.5445451734756358, + "compression_loss": 64.22098541259766, + "distillation_loss": 1.782759428024292, + "epoch": 2.79, + "learning_rate": 4.005353620738237e-05, + "loss": 66.5371, + "step": 3301, + "task_loss": 1.2197626829147339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6000623442896178, + "compression/movement_sparsity/importance_threshold": -0.0028010572849644193, + "compression/movement_sparsity/linear_layer_sparsity": 0.5644473715891157, + "compression/movement_sparsity/model_sparsity": 0.5450568679318094, + "compression_loss": 64.26970672607422, + "distillation_loss": 2.159693956375122, + "epoch": 2.79, + "learning_rate": 4.004884004884005e-05, + "loss": 66.7084, + "step": 3302, + "task_loss": 1.0949251651763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6005206379681363, + "compression/movement_sparsity/importance_threshold": -0.002797847517570581, + "compression/movement_sparsity/linear_layer_sparsity": 0.5650847302734202, + "compression/movement_sparsity/model_sparsity": 0.5456723313845634, + "compression_loss": 64.31838989257812, + "distillation_loss": 1.7270528078079224, + "epoch": 2.79, + "learning_rate": 4.004414389029774e-05, + "loss": 66.5947, + "step": 3303, + "task_loss": 2.5444464683532715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6009785814031265, + "compression/movement_sparsity/importance_threshold": -0.0027946402031895365, + "compression/movement_sparsity/linear_layer_sparsity": 0.5658131061292893, + "compression/movement_sparsity/model_sparsity": 0.5463756852890371, + "compression_loss": 64.36705017089844, + "distillation_loss": 2.8622612953186035, + "epoch": 2.79, + "learning_rate": 4.0039447731755426e-05, + "loss": 66.7382, + "step": 3304, + "task_loss": 1.72420334815979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6014361747284744, + "compression/movement_sparsity/importance_threshold": -0.0027914353408835853, + "compression/movement_sparsity/linear_layer_sparsity": 0.5663598530637288, + "compression/movement_sparsity/model_sparsity": 0.5469036497842884, + "compression_loss": 64.41565704345703, + "distillation_loss": 2.7856898307800293, + "epoch": 2.79, + "learning_rate": 4.003475157321311e-05, + "loss": 66.4833, + "step": 3305, + "task_loss": 2.3755033016204834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6018934180780646, + "compression/movement_sparsity/importance_threshold": -0.002788232929715034, + "compression/movement_sparsity/linear_layer_sparsity": 0.5669857406987676, + "compression/movement_sparsity/model_sparsity": 0.5475080362536081, + "compression_loss": 64.4642333984375, + "distillation_loss": 2.2971198558807373, + "epoch": 2.79, + "learning_rate": 4.0030055414670805e-05, + "loss": 66.1085, + "step": 3306, + "task_loss": 1.1109089851379395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6023503115857822, + "compression/movement_sparsity/importance_threshold": -0.0027850329687461872, + "compression/movement_sparsity/linear_layer_sparsity": 0.5676101616611873, + "compression/movement_sparsity/model_sparsity": 0.548111006435025, + "compression_loss": 64.51274108886719, + "distillation_loss": 3.0211799144744873, + "epoch": 2.79, + "learning_rate": 4.0025359256128485e-05, + "loss": 66.9633, + "step": 3307, + "task_loss": 1.5738840103149414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6028068553855128, + "compression/movement_sparsity/importance_threshold": -0.0027818354570393455, + "compression/movement_sparsity/linear_layer_sparsity": 0.5682271300188345, + "compression/movement_sparsity/model_sparsity": 0.5487067800315704, + "compression_loss": 64.56127166748047, + "distillation_loss": 2.280118227005005, + "epoch": 2.8, + "learning_rate": 4.002066309758618e-05, + "loss": 66.6821, + "step": 3308, + "task_loss": 1.9887398481369019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6032630496111409, + "compression/movement_sparsity/importance_threshold": -0.002778640393656818, + "compression/movement_sparsity/linear_layer_sparsity": 0.5688846524706113, + "compression/movement_sparsity/model_sparsity": 0.5493417145643529, + "compression_loss": 64.60972595214844, + "distillation_loss": 2.3958847522735596, + "epoch": 2.8, + "learning_rate": 4.0015966939043864e-05, + "loss": 67.1087, + "step": 3309, + "task_loss": 1.8467392921447754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6037188943965524, + "compression/movement_sparsity/importance_threshold": -0.002775447777660902, + "compression/movement_sparsity/linear_layer_sparsity": 0.5695057108177575, + "compression/movement_sparsity/model_sparsity": 0.5499414376466758, + "compression_loss": 64.65817260742188, + "distillation_loss": 1.6658375263214111, + "epoch": 2.8, + "learning_rate": 4.001127078050155e-05, + "loss": 66.5614, + "step": 3310, + "task_loss": 2.180555820465088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6041743898756315, + "compression/movement_sparsity/importance_threshold": -0.0027722576081139092, + "compression/movement_sparsity/linear_layer_sparsity": 0.5701782457965878, + "compression/movement_sparsity/model_sparsity": 0.5505908689800236, + "compression_loss": 64.70655059814453, + "distillation_loss": 2.74625825881958, + "epoch": 2.8, + "learning_rate": 4.0006574621959244e-05, + "loss": 66.9444, + "step": 3311, + "task_loss": 2.65535831451416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6046295361822644, + "compression/movement_sparsity/importance_threshold": -0.002769069884078136, + "compression/movement_sparsity/linear_layer_sparsity": 0.5707021579500048, + "compression/movement_sparsity/model_sparsity": 0.5510967831392284, + "compression_loss": 64.75489044189453, + "distillation_loss": 1.6281743049621582, + "epoch": 2.8, + "learning_rate": 4.000187846341692e-05, + "loss": 66.647, + "step": 3312, + "task_loss": 0.9411493539810181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6050843334503353, + "compression/movement_sparsity/importance_threshold": -0.0027658846046158935, + "compression/movement_sparsity/linear_layer_sparsity": 0.5714760006570694, + "compression/movement_sparsity/model_sparsity": 0.5518440419686867, + "compression_loss": 64.80323028564453, + "distillation_loss": 2.453622579574585, + "epoch": 2.8, + "learning_rate": 3.9997182304874616e-05, + "loss": 67.3421, + "step": 3313, + "task_loss": 1.5507417917251587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6055387818137299, + "compression/movement_sparsity/importance_threshold": -0.00276270176878948, + "compression/movement_sparsity/linear_layer_sparsity": 0.5720083551211724, + "compression/movement_sparsity/model_sparsity": 0.5523581084192339, + "compression_loss": 64.85147857666016, + "distillation_loss": 1.681124210357666, + "epoch": 2.8, + "learning_rate": 3.99924861463323e-05, + "loss": 66.9733, + "step": 3314, + "task_loss": 1.5357396602630615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6059928814063333, + "compression/movement_sparsity/importance_threshold": -0.002759521375661202, + "compression/movement_sparsity/linear_layer_sparsity": 0.5726030610578434, + "compression/movement_sparsity/model_sparsity": 0.552932384377451, + "compression_loss": 64.89973449707031, + "distillation_loss": 0.991226077079773, + "epoch": 2.8, + "learning_rate": 3.998778998778999e-05, + "loss": 66.6648, + "step": 3315, + "task_loss": 0.5043244957923889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6064466323620304, + "compression/movement_sparsity/importance_threshold": -0.002756343424293363, + "compression/movement_sparsity/linear_layer_sparsity": 0.5731223824067205, + "compression/movement_sparsity/model_sparsity": 0.5534338654403748, + "compression_loss": 64.9478988647461, + "distillation_loss": 2.238922595977783, + "epoch": 2.8, + "learning_rate": 3.9983093829247675e-05, + "loss": 67.4722, + "step": 3316, + "task_loss": 1.810430884361267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6069000348147066, + "compression/movement_sparsity/importance_threshold": -0.002753167913748267, + "compression/movement_sparsity/linear_layer_sparsity": 0.5737382179684424, + "compression/movement_sparsity/model_sparsity": 0.5540285451560196, + "compression_loss": 64.99605560302734, + "distillation_loss": 1.8672842979431152, + "epoch": 2.8, + "learning_rate": 3.997839767070536e-05, + "loss": 67.0269, + "step": 3317, + "task_loss": 0.9731149077415466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6073530888982466, + "compression/movement_sparsity/importance_threshold": -0.0027499948430882203, + "compression/movement_sparsity/linear_layer_sparsity": 0.5743560925628298, + "compression/movement_sparsity/model_sparsity": 0.5546251938572854, + "compression_loss": 65.044189453125, + "distillation_loss": 1.3677279949188232, + "epoch": 2.8, + "learning_rate": 3.9973701512163055e-05, + "loss": 67.1439, + "step": 3318, + "task_loss": 1.156369924545288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6078057947465363, + "compression/movement_sparsity/importance_threshold": -0.0027468242113755214, + "compression/movement_sparsity/linear_layer_sparsity": 0.5750406947993076, + "compression/movement_sparsity/model_sparsity": 0.5552862779008573, + "compression_loss": 65.09223937988281, + "distillation_loss": 2.0157811641693115, + "epoch": 2.81, + "learning_rate": 3.996900535362074e-05, + "loss": 66.978, + "step": 3319, + "task_loss": 1.4118953943252563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6082581524934598, + "compression/movement_sparsity/importance_threshold": -0.0027436560176724812, + "compression/movement_sparsity/linear_layer_sparsity": 0.5757692614418589, + "compression/movement_sparsity/model_sparsity": 0.5559898160379038, + "compression_loss": 65.1402816772461, + "distillation_loss": 1.8784645795822144, + "epoch": 2.81, + "learning_rate": 3.996430919507843e-05, + "loss": 67.342, + "step": 3320, + "task_loss": 0.907711386680603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6087101622729034, + "compression/movement_sparsity/importance_threshold": -0.002740490261041396, + "compression/movement_sparsity/linear_layer_sparsity": 0.5762231071862475, + "compression/movement_sparsity/model_sparsity": 0.5564280707847797, + "compression_loss": 65.18828582763672, + "distillation_loss": 2.22189998626709, + "epoch": 2.81, + "learning_rate": 3.9959613036536114e-05, + "loss": 67.7007, + "step": 3321, + "task_loss": 0.9296095967292786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6091618242187513, + "compression/movement_sparsity/importance_threshold": -0.002737326940544577, + "compression/movement_sparsity/linear_layer_sparsity": 0.5767483548464396, + "compression/movement_sparsity/model_sparsity": 0.5569352745719934, + "compression_loss": 65.23625183105469, + "distillation_loss": 2.2014453411102295, + "epoch": 2.81, + "learning_rate": 3.99549168779938e-05, + "loss": 67.5935, + "step": 3322, + "task_loss": 1.5445044040679932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6096131384648894, + "compression/movement_sparsity/importance_threshold": -0.002734166055244322, + "compression/movement_sparsity/linear_layer_sparsity": 0.5772872914510777, + "compression/movement_sparsity/model_sparsity": 0.5574556970462994, + "compression_loss": 65.2842025756836, + "distillation_loss": 2.454529285430908, + "epoch": 2.81, + "learning_rate": 3.995022071945149e-05, + "loss": 67.1978, + "step": 3323, + "task_loss": 0.5902627110481262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.610064105145202, + "compression/movement_sparsity/importance_threshold": -0.002731007604202941, + "compression/movement_sparsity/linear_layer_sparsity": 0.5778907139542906, + "compression/movement_sparsity/model_sparsity": 0.5580383901301821, + "compression_loss": 65.33211517333984, + "distillation_loss": 2.196749687194824, + "epoch": 2.81, + "learning_rate": 3.994552456090917e-05, + "loss": 67.3193, + "step": 3324, + "task_loss": 1.0639160871505737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.610514724393575, + "compression/movement_sparsity/importance_threshold": -0.002727851586482734, + "compression/movement_sparsity/linear_layer_sparsity": 0.5783594887565593, + "compression/movement_sparsity/model_sparsity": 0.5584910610758729, + "compression_loss": 65.37999725341797, + "distillation_loss": 1.3598170280456543, + "epoch": 2.81, + "learning_rate": 3.9940828402366866e-05, + "loss": 67.58, + "step": 3325, + "task_loss": 1.3490220308303833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6109649963438932, + "compression/movement_sparsity/importance_threshold": -0.0027246980011460045, + "compression/movement_sparsity/linear_layer_sparsity": 0.579070061830148, + "compression/movement_sparsity/model_sparsity": 0.5591772237784053, + "compression_loss": 65.42781066894531, + "distillation_loss": 1.074196696281433, + "epoch": 2.81, + "learning_rate": 3.993613224382455e-05, + "loss": 67.5323, + "step": 3326, + "task_loss": 0.4648706912994385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6114149211300417, + "compression/movement_sparsity/importance_threshold": -0.0027215468472550595, + "compression/movement_sparsity/linear_layer_sparsity": 0.5796052184736455, + "compression/movement_sparsity/model_sparsity": 0.5596939961448644, + "compression_loss": 65.4756088256836, + "distillation_loss": 1.6453258991241455, + "epoch": 2.81, + "learning_rate": 3.9931436085282246e-05, + "loss": 67.6557, + "step": 3327, + "task_loss": 0.37027639150619507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6118644988859057, + "compression/movement_sparsity/importance_threshold": -0.002718398123872202, + "compression/movement_sparsity/linear_layer_sparsity": 0.5802011645237507, + "compression/movement_sparsity/model_sparsity": 0.5602694696148038, + "compression_loss": 65.52339172363281, + "distillation_loss": 2.628255844116211, + "epoch": 2.81, + "learning_rate": 3.992673992673993e-05, + "loss": 67.8869, + "step": 3328, + "task_loss": 2.39514422416687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6123137297453702, + "compression/movement_sparsity/importance_threshold": -0.0027152518300597346, + "compression/movement_sparsity/linear_layer_sparsity": 0.5808247627186034, + "compression/movement_sparsity/model_sparsity": 0.560871645293251, + "compression_loss": 65.57113647460938, + "distillation_loss": 1.8479111194610596, + "epoch": 2.81, + "learning_rate": 3.992204376819761e-05, + "loss": 67.3738, + "step": 3329, + "task_loss": 2.0053887367248535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.612762613842321, + "compression/movement_sparsity/importance_threshold": -0.00271210796487996, + "compression/movement_sparsity/linear_layer_sparsity": 0.5813061294418955, + "compression/movement_sparsity/model_sparsity": 0.5613364755887408, + "compression_loss": 65.61878967285156, + "distillation_loss": 2.45310115814209, + "epoch": 2.81, + "learning_rate": 3.9917347609655305e-05, + "loss": 68.7122, + "step": 3330, + "task_loss": 0.6467757225036621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6132111513106422, + "compression/movement_sparsity/importance_threshold": -0.0027089665273951876, + "compression/movement_sparsity/linear_layer_sparsity": 0.5819125449111849, + "compression/movement_sparsity/model_sparsity": 0.561922058821108, + "compression_loss": 65.66645812988281, + "distillation_loss": 2.1474649906158447, + "epoch": 2.82, + "learning_rate": 3.991265145111299e-05, + "loss": 67.877, + "step": 3331, + "task_loss": 1.3735401630401611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6136593422842198, + "compression/movement_sparsity/importance_threshold": -0.0027058275166677156, + "compression/movement_sparsity/linear_layer_sparsity": 0.5825284639420802, + "compression/movement_sparsity/model_sparsity": 0.5625168191385034, + "compression_loss": 65.71409606933594, + "distillation_loss": 2.8586883544921875, + "epoch": 2.82, + "learning_rate": 3.9907955292570684e-05, + "loss": 67.9868, + "step": 3332, + "task_loss": 2.0969676971435547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6141071868969384, + "compression/movement_sparsity/importance_threshold": -0.002702690931759852, + "compression/movement_sparsity/linear_layer_sparsity": 0.5830267152867448, + "compression/movement_sparsity/model_sparsity": 0.5629979540166783, + "compression_loss": 65.76168060302734, + "distillation_loss": 2.353996753692627, + "epoch": 2.82, + "learning_rate": 3.9903259134028364e-05, + "loss": 67.9128, + "step": 3333, + "task_loss": 2.0860259532928467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6145546852826838, + "compression/movement_sparsity/importance_threshold": -0.002699556771733897, + "compression/movement_sparsity/linear_layer_sparsity": 0.5836051328104251, + "compression/movement_sparsity/model_sparsity": 0.5635565011189999, + "compression_loss": 65.80924987792969, + "distillation_loss": 2.699641704559326, + "epoch": 2.82, + "learning_rate": 3.989856297548606e-05, + "loss": 68.3635, + "step": 3334, + "task_loss": 2.0088369846343994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6150018375753401, + "compression/movement_sparsity/importance_threshold": -0.0026964250356521597, + "compression/movement_sparsity/linear_layer_sparsity": 0.5841153321710608, + "compression/movement_sparsity/model_sparsity": 0.5640491735620409, + "compression_loss": 65.85678100585938, + "distillation_loss": 1.855750322341919, + "epoch": 2.82, + "learning_rate": 3.989386681694374e-05, + "loss": 68.0974, + "step": 3335, + "task_loss": 2.192744493484497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6154486439087934, + "compression/movement_sparsity/importance_threshold": -0.0026932957225769385, + "compression/movement_sparsity/linear_layer_sparsity": 0.5846438112806823, + "compression/movement_sparsity/model_sparsity": 0.5645594977884549, + "compression_loss": 65.90428924560547, + "distillation_loss": 2.667698621749878, + "epoch": 2.82, + "learning_rate": 3.988917065840143e-05, + "loss": 68.6174, + "step": 3336, + "task_loss": 1.8491542339324951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6158951044169283, + "compression/movement_sparsity/importance_threshold": -0.002690168831570542, + "compression/movement_sparsity/linear_layer_sparsity": 0.5851092353918453, + "compression/movement_sparsity/model_sparsity": 0.5650089331495874, + "compression_loss": 65.95169830322266, + "distillation_loss": 2.902984380722046, + "epoch": 2.82, + "learning_rate": 3.9884474499859116e-05, + "loss": 68.2922, + "step": 3337, + "task_loss": 1.2953808307647705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6163412192336302, + "compression/movement_sparsity/importance_threshold": -0.0026870443616952714, + "compression/movement_sparsity/linear_layer_sparsity": 0.585718393419691, + "compression/movement_sparsity/model_sparsity": 0.5655971647251874, + "compression_loss": 65.99913024902344, + "distillation_loss": 3.1647300720214844, + "epoch": 2.82, + "learning_rate": 3.98797783413168e-05, + "loss": 68.3575, + "step": 3338, + "task_loss": 1.9030823707580566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6167869884927843, + "compression/movement_sparsity/importance_threshold": -0.00268392231201343, + "compression/movement_sparsity/linear_layer_sparsity": 0.5861663486252675, + "compression/movement_sparsity/model_sparsity": 0.5660297312913809, + "compression_loss": 66.04651641845703, + "distillation_loss": 3.1306753158569336, + "epoch": 2.82, + "learning_rate": 3.9875082182774495e-05, + "loss": 68.1845, + "step": 3339, + "task_loss": 1.6388765573501587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6172324123282753, + "compression/movement_sparsity/importance_threshold": -0.002680802681587326, + "compression/movement_sparsity/linear_layer_sparsity": 0.586757894657515, + "compression/movement_sparsity/model_sparsity": 0.5666009558976123, + "compression_loss": 66.09388732910156, + "distillation_loss": 1.6145071983337402, + "epoch": 2.82, + "learning_rate": 3.987038602423218e-05, + "loss": 67.8443, + "step": 3340, + "task_loss": 1.1877573728561401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.617677490873989, + "compression/movement_sparsity/importance_threshold": -0.0026776854694792584, + "compression/movement_sparsity/linear_layer_sparsity": 0.5872950307128401, + "compression/movement_sparsity/model_sparsity": 0.5671196396770132, + "compression_loss": 66.14115142822266, + "distillation_loss": 1.7615495920181274, + "epoch": 2.82, + "learning_rate": 3.986568986568987e-05, + "loss": 68.2722, + "step": 3341, + "task_loss": 0.5915787220001221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6181222242638098, + "compression/movement_sparsity/importance_threshold": -0.002674570674751536, + "compression/movement_sparsity/linear_layer_sparsity": 0.587884633105763, + "compression/movement_sparsity/model_sparsity": 0.5676889874139099, + "compression_loss": 66.18843078613281, + "distillation_loss": 2.699761390686035, + "epoch": 2.82, + "learning_rate": 3.9860993707147554e-05, + "loss": 68.3208, + "step": 3342, + "task_loss": 1.716234564781189 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6185666126316237, + "compression/movement_sparsity/importance_threshold": -0.0026714582964664562, + "compression/movement_sparsity/linear_layer_sparsity": 0.5884588652466032, + "compression/movement_sparsity/model_sparsity": 0.5682434929141676, + "compression_loss": 66.23563385009766, + "distillation_loss": 2.8895297050476074, + "epoch": 2.83, + "learning_rate": 3.985629754860524e-05, + "loss": 68.0224, + "step": 3343, + "task_loss": 2.2739460468292236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6190106561113149, + "compression/movement_sparsity/importance_threshold": -0.0026683483336863304, + "compression/movement_sparsity/linear_layer_sparsity": 0.5890808179063222, + "compression/movement_sparsity/model_sparsity": 0.5688440795866752, + "compression_loss": 66.2828369140625, + "distillation_loss": 1.3110913038253784, + "epoch": 2.83, + "learning_rate": 3.9851601390062934e-05, + "loss": 68.0116, + "step": 3344, + "task_loss": 0.5842018723487854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6194543548367692, + "compression/movement_sparsity/importance_threshold": -0.002665240785473458, + "compression/movement_sparsity/linear_layer_sparsity": 0.5897905324398411, + "compression/movement_sparsity/model_sparsity": 0.5695294132426304, + "compression_loss": 66.32998657226562, + "distillation_loss": 1.4897422790527344, + "epoch": 2.83, + "learning_rate": 3.984690523152062e-05, + "loss": 68.5351, + "step": 3345, + "task_loss": 0.924243152141571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6198977089418716, + "compression/movement_sparsity/importance_threshold": -0.0026621356508901442, + "compression/movement_sparsity/linear_layer_sparsity": 0.5903972460133214, + "compression/movement_sparsity/model_sparsity": 0.5701152843383924, + "compression_loss": 66.3770523071289, + "distillation_loss": 1.7013236284255981, + "epoch": 2.83, + "learning_rate": 3.9842209072978306e-05, + "loss": 68.649, + "step": 3346, + "task_loss": 0.927203357219696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6203407185605072, + "compression/movement_sparsity/importance_threshold": -0.0026590329289986918, + "compression/movement_sparsity/linear_layer_sparsity": 0.5908869835022941, + "compression/movement_sparsity/model_sparsity": 0.5705881978380101, + "compression_loss": 66.42414093017578, + "distillation_loss": 2.592482089996338, + "epoch": 2.83, + "learning_rate": 3.983751291443599e-05, + "loss": 68.4909, + "step": 3347, + "task_loss": 2.7591958045959473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6207833838265611, + "compression/movement_sparsity/importance_threshold": -0.002655932618861407, + "compression/movement_sparsity/linear_layer_sparsity": 0.5915580518085051, + "compression/movement_sparsity/model_sparsity": 0.5712362128834552, + "compression_loss": 66.47114562988281, + "distillation_loss": 3.3673043251037598, + "epoch": 2.83, + "learning_rate": 3.983281675589368e-05, + "loss": 68.8837, + "step": 3348, + "task_loss": 2.150573968887329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6212257048739183, + "compression/movement_sparsity/importance_threshold": -0.0026528347195405935, + "compression/movement_sparsity/linear_layer_sparsity": 0.5921689507648257, + "compression/movement_sparsity/model_sparsity": 0.5718261255812811, + "compression_loss": 66.51812744140625, + "distillation_loss": 2.211599111557007, + "epoch": 2.83, + "learning_rate": 3.982812059735137e-05, + "loss": 68.9843, + "step": 3349, + "task_loss": 1.7108513116836548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6216676818364644, + "compression/movement_sparsity/importance_threshold": -0.002649739230098552, + "compression/movement_sparsity/linear_layer_sparsity": 0.5927960069682928, + "compression/movement_sparsity/model_sparsity": 0.5724316404751086, + "compression_loss": 66.56510162353516, + "distillation_loss": 1.3534952402114868, + "epoch": 2.83, + "learning_rate": 3.982342443880905e-05, + "loss": 69.0795, + "step": 3350, + "task_loss": 1.6996979713439941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.622109314848084, + "compression/movement_sparsity/importance_threshold": -0.002646646149597591, + "compression/movement_sparsity/linear_layer_sparsity": 0.5933821990492718, + "compression/movement_sparsity/model_sparsity": 0.5729976950547683, + "compression_loss": 66.61201477050781, + "distillation_loss": 1.6764590740203857, + "epoch": 2.83, + "learning_rate": 3.9818728280266745e-05, + "loss": 69.0615, + "step": 3351, + "task_loss": 2.2385294437408447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6225506040426627, + "compression/movement_sparsity/importance_threshold": -0.002643555477100011, + "compression/movement_sparsity/linear_layer_sparsity": 0.5939691662011471, + "compression/movement_sparsity/model_sparsity": 0.5735644980792546, + "compression_loss": 66.65890502929688, + "distillation_loss": 1.744621992111206, + "epoch": 2.83, + "learning_rate": 3.981403212172443e-05, + "loss": 68.6025, + "step": 3352, + "task_loss": 0.7098692655563354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6229915495540852, + "compression/movement_sparsity/importance_threshold": -0.0026404672116681184, + "compression/movement_sparsity/linear_layer_sparsity": 0.594566268895513, + "compression/movement_sparsity/model_sparsity": 0.5741410884591661, + "compression_loss": 66.70573425292969, + "distillation_loss": 3.49221134185791, + "epoch": 2.83, + "learning_rate": 3.980933596318212e-05, + "loss": 69.4774, + "step": 3353, + "task_loss": 2.386349678039551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6234321515162371, + "compression/movement_sparsity/importance_threshold": -0.0026373813523642153, + "compression/movement_sparsity/linear_layer_sparsity": 0.5950856856377312, + "compression/movement_sparsity/model_sparsity": 0.5746426616383764, + "compression_loss": 66.7525863647461, + "distillation_loss": 1.6641709804534912, + "epoch": 2.83, + "learning_rate": 3.9804639804639804e-05, + "loss": 69.1548, + "step": 3354, + "task_loss": 2.0368950366973877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6238724100630031, + "compression/movement_sparsity/importance_threshold": -0.0026342978982506074, + "compression/movement_sparsity/linear_layer_sparsity": 0.5956902767093724, + "compression/movement_sparsity/model_sparsity": 0.5752264831467669, + "compression_loss": 66.79933166503906, + "distillation_loss": 1.8775321245193481, + "epoch": 2.84, + "learning_rate": 3.979994364609749e-05, + "loss": 69.1763, + "step": 3355, + "task_loss": 1.2580708265304565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6243123253282686, + "compression/movement_sparsity/importance_threshold": -0.0026312168483895977, + "compression/movement_sparsity/linear_layer_sparsity": 0.5962858650344485, + "compression/movement_sparsity/model_sparsity": 0.5758016111806327, + "compression_loss": 66.8460693359375, + "distillation_loss": 2.5882701873779297, + "epoch": 2.84, + "learning_rate": 3.979524748755518e-05, + "loss": 69.5024, + "step": 3356, + "task_loss": 1.6682413816452026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6247518974459187, + "compression/movement_sparsity/importance_threshold": -0.00262813820184349, + "compression/movement_sparsity/linear_layer_sparsity": 0.5970708687624202, + "compression/movement_sparsity/model_sparsity": 0.5765596476155946, + "compression_loss": 66.89278411865234, + "distillation_loss": 3.0579864978790283, + "epoch": 2.84, + "learning_rate": 3.979055132901287e-05, + "loss": 69.0322, + "step": 3357, + "task_loss": 1.3409351110458374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6251911265498385, + "compression/movement_sparsity/importance_threshold": -0.0026250619576745895, + "compression/movement_sparsity/linear_layer_sparsity": 0.5975897012204242, + "compression/movement_sparsity/model_sparsity": 0.5770606565825509, + "compression_loss": 66.9394760131836, + "distillation_loss": 3.6637699604034424, + "epoch": 2.84, + "learning_rate": 3.978585517047056e-05, + "loss": 69.2853, + "step": 3358, + "task_loss": 2.6948177814483643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6256300127739135, + "compression/movement_sparsity/importance_threshold": -0.002621988114945197, + "compression/movement_sparsity/linear_layer_sparsity": 0.5982219205884777, + "compression/movement_sparsity/model_sparsity": 0.5776711572703775, + "compression_loss": 66.9861068725586, + "distillation_loss": 1.2880866527557373, + "epoch": 2.84, + "learning_rate": 3.978115901192824e-05, + "loss": 69.891, + "step": 3359, + "task_loss": 1.3387134075164795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6260685562520281, + "compression/movement_sparsity/importance_threshold": -0.002618916672717621, + "compression/movement_sparsity/linear_layer_sparsity": 0.5987688702337671, + "compression/movement_sparsity/model_sparsity": 0.5781993175127373, + "compression_loss": 67.03270721435547, + "distillation_loss": 2.382185220718384, + "epoch": 2.84, + "learning_rate": 3.977646285338593e-05, + "loss": 69.3353, + "step": 3360, + "task_loss": 1.646415114402771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6265067571180682, + "compression/movement_sparsity/importance_threshold": -0.002615847630054161, + "compression/movement_sparsity/linear_layer_sparsity": 0.5992763151116789, + "compression/movement_sparsity/model_sparsity": 0.5786893300980098, + "compression_loss": 67.07929229736328, + "distillation_loss": 2.7674007415771484, + "epoch": 2.84, + "learning_rate": 3.977176669484362e-05, + "loss": 69.8185, + "step": 3361, + "task_loss": 1.3305968046188354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6269446155059183, + "compression/movement_sparsity/importance_threshold": -0.002612780986017125, + "compression/movement_sparsity/linear_layer_sparsity": 0.5997412264836336, + "compression/movement_sparsity/model_sparsity": 0.5791382703341031, + "compression_loss": 67.12576293945312, + "distillation_loss": 1.6906708478927612, + "epoch": 2.84, + "learning_rate": 3.976707053630131e-05, + "loss": 68.8935, + "step": 3362, + "task_loss": 0.7392365336418152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6273821315494641, + "compression/movement_sparsity/importance_threshold": -0.002609716739668814, + "compression/movement_sparsity/linear_layer_sparsity": 0.6003295768389547, + "compression/movement_sparsity/model_sparsity": 0.5797064090447416, + "compression_loss": 67.1722412109375, + "distillation_loss": 1.6026190519332886, + "epoch": 2.84, + "learning_rate": 3.9762374377758994e-05, + "loss": 70.0181, + "step": 3363, + "task_loss": 1.0470930337905884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6278193053825902, + "compression/movement_sparsity/importance_threshold": -0.0026066548900715346, + "compression/movement_sparsity/linear_layer_sparsity": 0.600883895619843, + "compression/movement_sparsity/model_sparsity": 0.5802416852702224, + "compression_loss": 67.21868133544922, + "distillation_loss": 3.251743793487549, + "epoch": 2.84, + "learning_rate": 3.975767821921668e-05, + "loss": 69.7877, + "step": 3364, + "task_loss": 2.536362409591675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6282561371391822, + "compression/movement_sparsity/importance_threshold": -0.0026035954362875887, + "compression/movement_sparsity/linear_layer_sparsity": 0.6015237464551835, + "compression/movement_sparsity/model_sparsity": 0.5808595552609574, + "compression_loss": 67.26506805419922, + "distillation_loss": 2.310790777206421, + "epoch": 2.84, + "learning_rate": 3.9752982060674374e-05, + "loss": 69.6227, + "step": 3365, + "task_loss": 2.104435920715332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.628692626953125, + "compression/movement_sparsity/importance_threshold": -0.00260053837737928, + "compression/movement_sparsity/linear_layer_sparsity": 0.6020070091211297, + "compression/movement_sparsity/model_sparsity": 0.5813262163676386, + "compression_loss": 67.3114013671875, + "distillation_loss": 2.838500499725342, + "epoch": 2.84, + "learning_rate": 3.974828590213206e-05, + "loss": 69.7047, + "step": 3366, + "task_loss": 2.7425289154052734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6291287749583039, + "compression/movement_sparsity/importance_threshold": -0.0025974837124089134, + "compression/movement_sparsity/linear_layer_sparsity": 0.6024330834790944, + "compression/movement_sparsity/model_sparsity": 0.5817376537606491, + "compression_loss": 67.35771942138672, + "distillation_loss": 1.9508877992630005, + "epoch": 2.85, + "learning_rate": 3.974358974358974e-05, + "loss": 70.2413, + "step": 3367, + "task_loss": 1.194808006286621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6295645812886038, + "compression/movement_sparsity/importance_threshold": -0.0025944314404387934, + "compression/movement_sparsity/linear_layer_sparsity": 0.6029597739635705, + "compression/movement_sparsity/model_sparsity": 0.582246250806694, + "compression_loss": 67.40400695800781, + "distillation_loss": 2.17785906791687, + "epoch": 2.85, + "learning_rate": 3.973889358504743e-05, + "loss": 70.2992, + "step": 3368, + "task_loss": 1.6217352151870728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6300000460779098, + "compression/movement_sparsity/importance_threshold": -0.002591381560531225, + "compression/movement_sparsity/linear_layer_sparsity": 0.6035609070265971, + "compression/movement_sparsity/model_sparsity": 0.5828267330997041, + "compression_loss": 67.45024871826172, + "distillation_loss": 1.50167977809906, + "epoch": 2.85, + "learning_rate": 3.973419742650512e-05, + "loss": 69.5914, + "step": 3369, + "task_loss": 1.1355386972427368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6304351694601077, + "compression/movement_sparsity/importance_threshold": -0.0025883340717485076, + "compression/movement_sparsity/linear_layer_sparsity": 0.6039875775929436, + "compression/movement_sparsity/model_sparsity": 0.5832387462195044, + "compression_loss": 67.4964370727539, + "distillation_loss": 1.582158088684082, + "epoch": 2.85, + "learning_rate": 3.972950126796281e-05, + "loss": 69.5025, + "step": 3370, + "task_loss": 1.1491085290908813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6308699515690818, + "compression/movement_sparsity/importance_threshold": -0.0025852889731529513, + "compression/movement_sparsity/linear_layer_sparsity": 0.6045432557289425, + "compression/movement_sparsity/model_sparsity": 0.5837753351020658, + "compression_loss": 67.54263305664062, + "distillation_loss": 2.3507776260375977, + "epoch": 2.85, + "learning_rate": 3.97248051094205e-05, + "loss": 69.8696, + "step": 3371, + "task_loss": 2.7397546768188477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6313043925387178, + "compression/movement_sparsity/importance_threshold": -0.002582246263806854, + "compression/movement_sparsity/linear_layer_sparsity": 0.605103989712019, + "compression/movement_sparsity/model_sparsity": 0.5843168061478041, + "compression_loss": 67.5887680053711, + "distillation_loss": 2.569270610809326, + "epoch": 2.85, + "learning_rate": 3.9720108950878185e-05, + "loss": 69.8661, + "step": 3372, + "task_loss": 1.4933189153671265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6317384925029005, + "compression/movement_sparsity/importance_threshold": -0.0025792059427725243, + "compression/movement_sparsity/linear_layer_sparsity": 0.6056836115766306, + "compression/movement_sparsity/model_sparsity": 0.584876516218241, + "compression_loss": 67.63484954833984, + "distillation_loss": 2.101522207260132, + "epoch": 2.85, + "learning_rate": 3.971541279233587e-05, + "loss": 70.0835, + "step": 3373, + "task_loss": 1.3961799144744873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6321722515955155, + "compression/movement_sparsity/importance_threshold": -0.0025761680091122624, + "compression/movement_sparsity/linear_layer_sparsity": 0.6061813621062544, + "compression/movement_sparsity/model_sparsity": 0.5853571674859125, + "compression_loss": 67.6809310913086, + "distillation_loss": 1.8312392234802246, + "epoch": 2.85, + "learning_rate": 3.971071663379356e-05, + "loss": 70.3236, + "step": 3374, + "task_loss": 1.2809613943099976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6326056699504472, + "compression/movement_sparsity/importance_threshold": -0.002573132461888378, + "compression/movement_sparsity/linear_layer_sparsity": 0.6067118802485416, + "compression/movement_sparsity/model_sparsity": 0.5858694606979473, + "compression_loss": 67.72693634033203, + "distillation_loss": 4.660986423492432, + "epoch": 2.85, + "learning_rate": 3.970602047525125e-05, + "loss": 70.2048, + "step": 3375, + "task_loss": 2.9179062843322754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6330387477015813, + "compression/movement_sparsity/importance_threshold": -0.0025700993001631697, + "compression/movement_sparsity/linear_layer_sparsity": 0.6072962240835371, + "compression/movement_sparsity/model_sparsity": 0.5864337305245588, + "compression_loss": 67.77293395996094, + "distillation_loss": 1.908196210861206, + "epoch": 2.85, + "learning_rate": 3.970132431670893e-05, + "loss": 70.0975, + "step": 3376, + "task_loss": 1.3825643062591553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6334714849828029, + "compression/movement_sparsity/importance_threshold": -0.0025670685229989424, + "compression/movement_sparsity/linear_layer_sparsity": 0.6078894156509184, + "compression/movement_sparsity/model_sparsity": 0.5870065441367298, + "compression_loss": 67.81888580322266, + "distillation_loss": 2.9968273639678955, + "epoch": 2.85, + "learning_rate": 3.9696628158166623e-05, + "loss": 70.1975, + "step": 3377, + "task_loss": 1.7829724550247192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6339038819279967, + "compression/movement_sparsity/importance_threshold": -0.0025640401294580034, + "compression/movement_sparsity/linear_layer_sparsity": 0.6082964232648332, + "compression/movement_sparsity/model_sparsity": 0.5873995697870049, + "compression_loss": 67.86475372314453, + "distillation_loss": 2.1071643829345703, + "epoch": 2.85, + "learning_rate": 3.969193199962431e-05, + "loss": 69.706, + "step": 3378, + "task_loss": 0.9504131078720093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6343359386710488, + "compression/movement_sparsity/importance_threshold": -0.0025610141186026515, + "compression/movement_sparsity/linear_layer_sparsity": 0.608742828328617, + "compression/movement_sparsity/model_sparsity": 0.5878306394635452, + "compression_loss": 67.91063690185547, + "distillation_loss": 1.7914707660675049, + "epoch": 2.86, + "learning_rate": 3.9687235841081996e-05, + "loss": 69.7904, + "step": 3379, + "task_loss": 1.778961181640625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6347676553458432, + "compression/movement_sparsity/importance_threshold": -0.002557990489495195, + "compression/movement_sparsity/linear_layer_sparsity": 0.6093360675926689, + "compression/movement_sparsity/model_sparsity": 0.5884034991338594, + "compression_loss": 67.95645141601562, + "distillation_loss": 1.8042007684707642, + "epoch": 2.86, + "learning_rate": 3.968253968253968e-05, + "loss": 70.1543, + "step": 3380, + "task_loss": 1.5165016651153564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6351990320862659, + "compression/movement_sparsity/importance_threshold": -0.0025549692411979347, + "compression/movement_sparsity/linear_layer_sparsity": 0.6098753738465037, + "compression/movement_sparsity/model_sparsity": 0.5889242785587748, + "compression_loss": 68.0022201538086, + "distillation_loss": 2.283940315246582, + "epoch": 2.86, + "learning_rate": 3.967784352399737e-05, + "loss": 70.0865, + "step": 3381, + "task_loss": 2.0026798248291016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6356300690262014, + "compression/movement_sparsity/importance_threshold": -0.0025519503727731785, + "compression/movement_sparsity/linear_layer_sparsity": 0.6104497490773556, + "compression/movement_sparsity/model_sparsity": 0.5894789222334621, + "compression_loss": 68.04796600341797, + "distillation_loss": 2.2981314659118652, + "epoch": 2.86, + "learning_rate": 3.967314736545506e-05, + "loss": 70.383, + "step": 3382, + "task_loss": 1.2954233884811401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6360607662995356, + "compression/movement_sparsity/importance_threshold": -0.002548933883283226, + "compression/movement_sparsity/linear_layer_sparsity": 0.6110890394768171, + "compression/movement_sparsity/model_sparsity": 0.5900962510410148, + "compression_loss": 68.09371185302734, + "distillation_loss": 1.399935007095337, + "epoch": 2.86, + "learning_rate": 3.966845120691275e-05, + "loss": 70.306, + "step": 3383, + "task_loss": 0.6511766910552979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6364911240401527, + "compression/movement_sparsity/importance_threshold": -0.002545919771790385, + "compression/movement_sparsity/linear_layer_sparsity": 0.6116268313613622, + "compression/movement_sparsity/model_sparsity": 0.5906155681198844, + "compression_loss": 68.13936614990234, + "distillation_loss": 3.2246179580688477, + "epoch": 2.86, + "learning_rate": 3.9663755048370435e-05, + "loss": 70.5414, + "step": 3384, + "task_loss": 2.5929291248321533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6369211423819386, + "compression/movement_sparsity/importance_threshold": -0.002542908037356956, + "compression/movement_sparsity/linear_layer_sparsity": 0.612124844222674, + "compression/movement_sparsity/model_sparsity": 0.5910964727073433, + "compression_loss": 68.18500518798828, + "distillation_loss": 2.1962943077087402, + "epoch": 2.86, + "learning_rate": 3.965905888982812e-05, + "loss": 70.3342, + "step": 3385, + "task_loss": 0.9510858654975891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6373508214587782, + "compression/movement_sparsity/importance_threshold": -0.002539898679045245, + "compression/movement_sparsity/linear_layer_sparsity": 0.6127913932693511, + "compression/movement_sparsity/model_sparsity": 0.5917401237437224, + "compression_loss": 68.23057556152344, + "distillation_loss": 2.1230571269989014, + "epoch": 2.86, + "learning_rate": 3.965436273128581e-05, + "loss": 70.2589, + "step": 3386, + "task_loss": 1.4040042161941528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6377801614045565, + "compression/movement_sparsity/importance_threshold": -0.0025368916959175557, + "compression/movement_sparsity/linear_layer_sparsity": 0.6133472741161998, + "compression/movement_sparsity/model_sparsity": 0.5922769083733923, + "compression_loss": 68.276123046875, + "distillation_loss": 2.478807210922241, + "epoch": 2.86, + "learning_rate": 3.96496665727435e-05, + "loss": 71.107, + "step": 3387, + "task_loss": 2.228999376296997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6382091623531588, + "compression/movement_sparsity/importance_threshold": -0.002533887087036192, + "compression/movement_sparsity/linear_layer_sparsity": 0.6138153930892485, + "compression/movement_sparsity/model_sparsity": 0.5927289460196143, + "compression_loss": 68.32160186767578, + "distillation_loss": 1.7568552494049072, + "epoch": 2.86, + "learning_rate": 3.964497041420119e-05, + "loss": 70.6454, + "step": 3388, + "task_loss": 1.4515525102615356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6386378244384701, + "compression/movement_sparsity/importance_threshold": -0.0025308848514634594, + "compression/movement_sparsity/linear_layer_sparsity": 0.6143859764347922, + "compression/movement_sparsity/model_sparsity": 0.593279928071919, + "compression_loss": 68.36710357666016, + "distillation_loss": 2.3434557914733887, + "epoch": 2.86, + "learning_rate": 3.964027425565887e-05, + "loss": 71.0432, + "step": 3389, + "task_loss": 1.457148551940918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6390661477943758, + "compression/movement_sparsity/importance_threshold": -0.0025278849882616585, + "compression/movement_sparsity/linear_layer_sparsity": 0.6148380931748736, + "compression/movement_sparsity/model_sparsity": 0.5937165132111047, + "compression_loss": 68.4125747680664, + "distillation_loss": 2.058634042739868, + "epoch": 2.87, + "learning_rate": 3.963557809711656e-05, + "loss": 70.675, + "step": 3390, + "task_loss": 1.6738131046295166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6394941325547607, + "compression/movement_sparsity/importance_threshold": -0.0025248874964930966, + "compression/movement_sparsity/linear_layer_sparsity": 0.6154294484204389, + "compression/movement_sparsity/model_sparsity": 0.5942875535847635, + "compression_loss": 68.45799255371094, + "distillation_loss": 3.2280449867248535, + "epoch": 2.87, + "learning_rate": 3.9630881938574246e-05, + "loss": 70.7327, + "step": 3391, + "task_loss": 1.5819525718688965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6399217788535103, + "compression/movement_sparsity/importance_threshold": -0.002521892375220074, + "compression/movement_sparsity/linear_layer_sparsity": 0.6160955443487458, + "compression/movement_sparsity/model_sparsity": 0.5949307670687822, + "compression_loss": 68.50337219238281, + "distillation_loss": 2.562039375305176, + "epoch": 2.87, + "learning_rate": 3.962618578003194e-05, + "loss": 71.2947, + "step": 3392, + "task_loss": 1.8190886974334717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6403490868245094, + "compression/movement_sparsity/importance_threshold": -0.0025188996235048994, + "compression/movement_sparsity/linear_layer_sparsity": 0.6165976114270538, + "compression/movement_sparsity/model_sparsity": 0.5954155865984114, + "compression_loss": 68.5487060546875, + "distillation_loss": 1.914147138595581, + "epoch": 2.87, + "learning_rate": 3.962148962148962e-05, + "loss": 71.1094, + "step": 3393, + "task_loss": 0.9982047080993652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6407760566016435, + "compression/movement_sparsity/importance_threshold": -0.002515909240409872, + "compression/movement_sparsity/linear_layer_sparsity": 0.6171095755644996, + "compression/movement_sparsity/model_sparsity": 0.59590996319275, + "compression_loss": 68.59396362304688, + "distillation_loss": 0.9200165271759033, + "epoch": 2.87, + "learning_rate": 3.961679346294731e-05, + "loss": 70.4365, + "step": 3394, + "task_loss": 0.3802624046802521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6412026883187972, + "compression/movement_sparsity/importance_threshold": -0.0025129212249973007, + "compression/movement_sparsity/linear_layer_sparsity": 0.617850090223022, + "compression/movement_sparsity/model_sparsity": 0.5966250388946626, + "compression_loss": 68.63921356201172, + "distillation_loss": 1.90958833694458, + "epoch": 2.87, + "learning_rate": 3.9612097304405e-05, + "loss": 70.8727, + "step": 3395, + "task_loss": 0.8666361570358276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6416289821098562, + "compression/movement_sparsity/importance_threshold": -0.0025099355763294855, + "compression/movement_sparsity/linear_layer_sparsity": 0.618466736628143, + "compression/movement_sparsity/model_sparsity": 0.5972205015987415, + "compression_loss": 68.68440246582031, + "distillation_loss": 2.463286876678467, + "epoch": 2.87, + "learning_rate": 3.960740114586269e-05, + "loss": 71.6596, + "step": 3396, + "task_loss": 1.5694712400436401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6420549381087054, + "compression/movement_sparsity/importance_threshold": -0.0025069522934687317, + "compression/movement_sparsity/linear_layer_sparsity": 0.6189950368752499, + "compression/movement_sparsity/model_sparsity": 0.5977306531071186, + "compression_loss": 68.72952270507812, + "distillation_loss": 3.5738658905029297, + "epoch": 2.87, + "learning_rate": 3.960270498732037e-05, + "loss": 71.1766, + "step": 3397, + "task_loss": 1.5617579221725464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6424805564492297, + "compression/movement_sparsity/importance_threshold": -0.002503971375477345, + "compression/movement_sparsity/linear_layer_sparsity": 0.6195369425976294, + "compression/movement_sparsity/model_sparsity": 0.5982539427008373, + "compression_loss": 68.77462005615234, + "distillation_loss": 2.5881693363189697, + "epoch": 2.87, + "learning_rate": 3.9598008828778064e-05, + "loss": 71.4496, + "step": 3398, + "task_loss": 1.954263687133789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.642905837265315, + "compression/movement_sparsity/importance_threshold": -0.0025009928214176245, + "compression/movement_sparsity/linear_layer_sparsity": 0.6200187385909564, + "compression/movement_sparsity/model_sparsity": 0.5987191875196157, + "compression_loss": 68.81973266601562, + "distillation_loss": 2.7040669918060303, + "epoch": 2.87, + "learning_rate": 3.959331267023575e-05, + "loss": 71.0068, + "step": 3399, + "task_loss": 1.8171714544296265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6433307806908455, + "compression/movement_sparsity/importance_threshold": -0.0024980166303518805, + "compression/movement_sparsity/linear_layer_sparsity": 0.6204427262195847, + "compression/movement_sparsity/model_sparsity": 0.5991286098688622, + "compression_loss": 68.86479187011719, + "distillation_loss": 1.901652455329895, + "epoch": 2.87, + "learning_rate": 3.9588616511693436e-05, + "loss": 71.2142, + "step": 3400, + "task_loss": 1.7284334897994995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6437553868597072, + "compression/movement_sparsity/importance_threshold": -0.0024950428013424102, + "compression/movement_sparsity/linear_layer_sparsity": 0.6209155910113525, + "compression/movement_sparsity/model_sparsity": 0.5995852303003305, + "compression_loss": 68.90982055664062, + "distillation_loss": 2.322361469268799, + "epoch": 2.87, + "learning_rate": 3.958392035315112e-05, + "loss": 70.776, + "step": 3401, + "task_loss": 1.1826398372650146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6441796559057844, + "compression/movement_sparsity/importance_threshold": -0.0024920713334515244, + "compression/movement_sparsity/linear_layer_sparsity": 0.6213386604790729, + "compression/movement_sparsity/model_sparsity": 0.5999937660303208, + "compression_loss": 68.9548110961914, + "distillation_loss": 2.1850244998931885, + "epoch": 2.88, + "learning_rate": 3.957922419460881e-05, + "loss": 71.2168, + "step": 3402, + "task_loss": 1.630164384841919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.644603587962963, + "compression/movement_sparsity/importance_threshold": -0.002489102225741522, + "compression/movement_sparsity/linear_layer_sparsity": 0.6217883566131243, + "compression/movement_sparsity/model_sparsity": 0.6004280137187403, + "compression_loss": 68.99978637695312, + "distillation_loss": 2.5563950538635254, + "epoch": 2.88, + "learning_rate": 3.95745280360665e-05, + "loss": 71.7443, + "step": 3403, + "task_loss": 2.1134989261627197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6450271831651275, + "compression/movement_sparsity/importance_threshold": -0.0024861354772747106, + "compression/movement_sparsity/linear_layer_sparsity": 0.6224237597339365, + "compression/movement_sparsity/model_sparsity": 0.601041588787624, + "compression_loss": 69.04474639892578, + "distillation_loss": 1.8564252853393555, + "epoch": 2.88, + "learning_rate": 3.956983187752419e-05, + "loss": 70.9939, + "step": 3404, + "task_loss": 1.93593168258667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6454504416461636, + "compression/movement_sparsity/importance_threshold": -0.0024831710871133913, + "compression/movement_sparsity/linear_layer_sparsity": 0.6229165259614886, + "compression/movement_sparsity/model_sparsity": 0.6015174269793334, + "compression_loss": 69.08961486816406, + "distillation_loss": 3.3086605072021484, + "epoch": 2.88, + "learning_rate": 3.9565135718981875e-05, + "loss": 72.3267, + "step": 3405, + "task_loss": 1.7085344791412354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6458733635399561, + "compression/movement_sparsity/importance_threshold": -0.0024802090543198695, + "compression/movement_sparsity/linear_layer_sparsity": 0.6234249724694818, + "compression/movement_sparsity/model_sparsity": 0.6020084067856126, + "compression_loss": 69.134521484375, + "distillation_loss": 1.772336483001709, + "epoch": 2.88, + "learning_rate": 3.956043956043956e-05, + "loss": 71.2518, + "step": 3406, + "task_loss": 0.9642235040664673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6462959489803901, + "compression/movement_sparsity/importance_threshold": -0.002477249377956449, + "compression/movement_sparsity/linear_layer_sparsity": 0.6239062437994328, + "compression/movement_sparsity/model_sparsity": 0.6024731449648161, + "compression_loss": 69.17930603027344, + "distillation_loss": 3.7423996925354004, + "epoch": 2.88, + "learning_rate": 3.955574340189725e-05, + "loss": 71.8873, + "step": 3407, + "task_loss": 1.615133285522461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.646718198101351, + "compression/movement_sparsity/importance_threshold": -0.002474292057085434, + "compression/movement_sparsity/linear_layer_sparsity": 0.6245149367847407, + "compression/movement_sparsity/model_sparsity": 0.60306092747352, + "compression_loss": 69.22412109375, + "distillation_loss": 2.753981113433838, + "epoch": 2.88, + "learning_rate": 3.955104724335494e-05, + "loss": 71.9212, + "step": 3408, + "task_loss": 1.4958302974700928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6471401110367236, + "compression/movement_sparsity/importance_threshold": -0.002471337090769129, + "compression/movement_sparsity/linear_layer_sparsity": 0.6250927938725422, + "compression/movement_sparsity/model_sparsity": 0.6036189333926593, + "compression_loss": 69.26885223388672, + "distillation_loss": 2.8000693321228027, + "epoch": 2.88, + "learning_rate": 3.954635108481263e-05, + "loss": 72.1132, + "step": 3409, + "task_loss": 1.432848334312439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6475616879203937, + "compression/movement_sparsity/importance_threshold": -0.0024683844780698343, + "compression/movement_sparsity/linear_layer_sparsity": 0.625718979611772, + "compression/movement_sparsity/model_sparsity": 0.6042236077253738, + "compression_loss": 69.31355285644531, + "distillation_loss": 2.859842300415039, + "epoch": 2.88, + "learning_rate": 3.954165492627031e-05, + "loss": 71.5616, + "step": 3410, + "task_loss": 1.4489312171936035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6479829288862455, + "compression/movement_sparsity/importance_threshold": -0.0024654342180498593, + "compression/movement_sparsity/linear_layer_sparsity": 0.6262631986226728, + "compression/movement_sparsity/model_sparsity": 0.6047491311390366, + "compression_loss": 69.35826110839844, + "distillation_loss": 2.56192684173584, + "epoch": 2.88, + "learning_rate": 3.9536958767728e-05, + "loss": 72.3112, + "step": 3411, + "task_loss": 1.698162317276001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.648403834068165, + "compression/movement_sparsity/importance_threshold": -0.0024624863097715034, + "compression/movement_sparsity/linear_layer_sparsity": 0.6266775276755161, + "compression/movement_sparsity/model_sparsity": 0.6051492267142896, + "compression_loss": 69.40292358398438, + "distillation_loss": 3.5586342811584473, + "epoch": 2.88, + "learning_rate": 3.9532262609185686e-05, + "loss": 72.0861, + "step": 3412, + "task_loss": 2.356511354446411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6488244036000367, + "compression/movement_sparsity/importance_threshold": -0.002459540752297075, + "compression/movement_sparsity/linear_layer_sparsity": 0.6272804374395207, + "compression/movement_sparsity/model_sparsity": 0.6057314246731331, + "compression_loss": 69.4474868774414, + "distillation_loss": 2.0453691482543945, + "epoch": 2.88, + "learning_rate": 3.952756645064338e-05, + "loss": 72.2958, + "step": 3413, + "task_loss": 1.8191465139389038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6492446376157464, + "compression/movement_sparsity/importance_threshold": -0.0024565975446888726, + "compression/movement_sparsity/linear_layer_sparsity": 0.6277815744327531, + "compression/movement_sparsity/model_sparsity": 0.6062153460689703, + "compression_loss": 69.49214172363281, + "distillation_loss": 3.582063674926758, + "epoch": 2.89, + "learning_rate": 3.952287029210106e-05, + "loss": 72.6939, + "step": 3414, + "task_loss": 3.2775070667266846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6496645362491783, + "compression/movement_sparsity/importance_threshold": -0.0024536566860092063, + "compression/movement_sparsity/linear_layer_sparsity": 0.6282623091751605, + "compression/movement_sparsity/model_sparsity": 0.606679566094063, + "compression_loss": 69.53668212890625, + "distillation_loss": 2.9792656898498535, + "epoch": 2.89, + "learning_rate": 3.951817413355875e-05, + "loss": 72.6849, + "step": 3415, + "task_loss": 2.7068710327148438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6500840996342183, + "compression/movement_sparsity/importance_threshold": -0.0024507181753203764, + "compression/movement_sparsity/linear_layer_sparsity": 0.6287962376293915, + "compression/movement_sparsity/model_sparsity": 0.6071951524633351, + "compression_loss": 69.5811996459961, + "distillation_loss": 2.6012816429138184, + "epoch": 2.89, + "learning_rate": 3.951347797501644e-05, + "loss": 71.9634, + "step": 3416, + "task_loss": 0.9923920035362244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6505033279047514, + "compression/movement_sparsity/importance_threshold": -0.002447782011684687, + "compression/movement_sparsity/linear_layer_sparsity": 0.6293147481348694, + "compression/movement_sparsity/model_sparsity": 0.6076958505378249, + "compression_loss": 69.62565612792969, + "distillation_loss": 2.068847894668579, + "epoch": 2.89, + "learning_rate": 3.9508781816474125e-05, + "loss": 72.1662, + "step": 3417, + "task_loss": 1.2652469873428345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6509222211946627, + "compression/movement_sparsity/importance_threshold": -0.002444848194164442, + "compression/movement_sparsity/linear_layer_sparsity": 0.6298689119015785, + "compression/movement_sparsity/model_sparsity": 0.6082309770743405, + "compression_loss": 69.67008972167969, + "distillation_loss": 4.21000862121582, + "epoch": 2.89, + "learning_rate": 3.950408565793182e-05, + "loss": 72.5042, + "step": 3418, + "task_loss": 2.495651960372925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6513407796378373, + "compression/movement_sparsity/importance_threshold": -0.0024419167218219457, + "compression/movement_sparsity/linear_layer_sparsity": 0.6303630136359057, + "compression/movement_sparsity/model_sparsity": 0.6087081048940588, + "compression_loss": 69.71454620361328, + "distillation_loss": 2.989088535308838, + "epoch": 2.89, + "learning_rate": 3.94993894993895e-05, + "loss": 72.5831, + "step": 3419, + "task_loss": 1.8758002519607544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6517590033681602, + "compression/movement_sparsity/importance_threshold": -0.002438987593719504, + "compression/movement_sparsity/linear_layer_sparsity": 0.6308652238042254, + "compression/movement_sparsity/model_sparsity": 0.6091930625981176, + "compression_loss": 69.75894165039062, + "distillation_loss": 3.083568572998047, + "epoch": 2.89, + "learning_rate": 3.949469334084719e-05, + "loss": 72.4935, + "step": 3420, + "task_loss": 2.7684543132781982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6521768925195169, + "compression/movement_sparsity/importance_threshold": -0.002436060808919418, + "compression/movement_sparsity/linear_layer_sparsity": 0.6313944064397373, + "compression/movement_sparsity/model_sparsity": 0.6097040661821435, + "compression_loss": 69.80332946777344, + "distillation_loss": 2.5045700073242188, + "epoch": 2.89, + "learning_rate": 3.948999718230488e-05, + "loss": 71.8744, + "step": 3421, + "task_loss": 2.545727014541626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6525944472257921, + "compression/movement_sparsity/importance_threshold": -0.002433136366483995, + "compression/movement_sparsity/linear_layer_sparsity": 0.631853522666221, + "compression/movement_sparsity/model_sparsity": 0.6101474103538407, + "compression_loss": 69.84757995605469, + "distillation_loss": 4.963435649871826, + "epoch": 2.89, + "learning_rate": 3.948530102376257e-05, + "loss": 73.1349, + "step": 3422, + "task_loss": 2.430263042449951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6530116676208715, + "compression/movement_sparsity/importance_threshold": -0.002430214265475533, + "compression/movement_sparsity/linear_layer_sparsity": 0.6323447983728185, + "compression/movement_sparsity/model_sparsity": 0.6106218092285758, + "compression_loss": 69.89185333251953, + "distillation_loss": 3.4002950191497803, + "epoch": 2.89, + "learning_rate": 3.948060486522025e-05, + "loss": 72.458, + "step": 3423, + "task_loss": 1.3298418521881104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6534285538386396, + "compression/movement_sparsity/importance_threshold": -0.002427294504956343, + "compression/movement_sparsity/linear_layer_sparsity": 0.6329355335616669, + "compression/movement_sparsity/model_sparsity": 0.6111922508463731, + "compression_loss": 69.93608856201172, + "distillation_loss": 2.6990725994110107, + "epoch": 2.89, + "learning_rate": 3.9475908706677936e-05, + "loss": 72.7111, + "step": 3424, + "task_loss": 1.5069451332092285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.653845106012982, + "compression/movement_sparsity/importance_threshold": -0.002424377083988725, + "compression/movement_sparsity/linear_layer_sparsity": 0.6333433877914839, + "compression/movement_sparsity/model_sparsity": 0.6115860940286896, + "compression_loss": 69.98027038574219, + "distillation_loss": 4.127119541168213, + "epoch": 2.89, + "learning_rate": 3.947121254813563e-05, + "loss": 72.7007, + "step": 3425, + "task_loss": 3.101025342941284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6542613242777837, + "compression/movement_sparsity/importance_threshold": -0.002421462001634983, + "compression/movement_sparsity/linear_layer_sparsity": 0.6338915060052016, + "compression/movement_sparsity/model_sparsity": 0.6121153826955572, + "compression_loss": 70.0244369506836, + "distillation_loss": 3.782341718673706, + "epoch": 2.9, + "learning_rate": 3.9466516389593315e-05, + "loss": 73.1741, + "step": 3426, + "task_loss": 2.8183016777038574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6546772087669298, + "compression/movement_sparsity/importance_threshold": -0.002418549256957423, + "compression/movement_sparsity/linear_layer_sparsity": 0.6343312573834446, + "compression/movement_sparsity/model_sparsity": 0.6125400272611241, + "compression_loss": 70.06854248046875, + "distillation_loss": 2.5417325496673584, + "epoch": 2.9, + "learning_rate": 3.9461820231051e-05, + "loss": 72.8555, + "step": 3427, + "task_loss": 1.1935113668441772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6550927596143055, + "compression/movement_sparsity/importance_threshold": -0.0024156388490183473, + "compression/movement_sparsity/linear_layer_sparsity": 0.6348840975675462, + "compression/movement_sparsity/model_sparsity": 0.6130738756841664, + "compression_loss": 70.11267852783203, + "distillation_loss": 1.734518051147461, + "epoch": 2.9, + "learning_rate": 3.945712407250869e-05, + "loss": 72.4502, + "step": 3428, + "task_loss": 0.9092952609062195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6555079769537958, + "compression/movement_sparsity/importance_threshold": -0.0024127307768800612, + "compression/movement_sparsity/linear_layer_sparsity": 0.6353808345429209, + "compression/movement_sparsity/model_sparsity": 0.6135535482162954, + "compression_loss": 70.15677642822266, + "distillation_loss": 3.5225138664245605, + "epoch": 2.9, + "learning_rate": 3.945242791396638e-05, + "loss": 73.2771, + "step": 3429, + "task_loss": 2.2300021648406982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6559228609192862, + "compression/movement_sparsity/importance_threshold": -0.0024098250396048662, + "compression/movement_sparsity/linear_layer_sparsity": 0.635898975399202, + "compression/movement_sparsity/model_sparsity": 0.6140538893401756, + "compression_loss": 70.20085906982422, + "distillation_loss": 1.5398763418197632, + "epoch": 2.9, + "learning_rate": 3.944773175542407e-05, + "loss": 72.5097, + "step": 3430, + "task_loss": 1.2171375751495361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6563374116446614, + "compression/movement_sparsity/importance_threshold": -0.002406921636255069, + "compression/movement_sparsity/linear_layer_sparsity": 0.636269208880128, + "compression/movement_sparsity/model_sparsity": 0.6144114041620602, + "compression_loss": 70.24490356445312, + "distillation_loss": 3.341099977493286, + "epoch": 2.9, + "learning_rate": 3.944303559688175e-05, + "loss": 72.9376, + "step": 3431, + "task_loss": 1.224884033203125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6567516292638069, + "compression/movement_sparsity/importance_threshold": -0.002404020565892971, + "compression/movement_sparsity/linear_layer_sparsity": 0.6368179352263951, + "compression/movement_sparsity/model_sparsity": 0.6149412800702534, + "compression_loss": 70.28887176513672, + "distillation_loss": 2.9473936557769775, + "epoch": 2.9, + "learning_rate": 3.943833943833944e-05, + "loss": 73.5411, + "step": 3432, + "task_loss": 2.231801986694336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6571655139106075, + "compression/movement_sparsity/importance_threshold": -0.0024011218275808797, + "compression/movement_sparsity/linear_layer_sparsity": 0.6372782438696424, + "compression/movement_sparsity/model_sparsity": 0.6153857756955301, + "compression_loss": 70.33283996582031, + "distillation_loss": 3.4031615257263184, + "epoch": 2.9, + "learning_rate": 3.9433643279797126e-05, + "loss": 73.041, + "step": 3433, + "task_loss": 1.198292851448059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6575790657189489, + "compression/movement_sparsity/importance_threshold": -0.002398225420381095, + "compression/movement_sparsity/linear_layer_sparsity": 0.6376679852888205, + "compression/movement_sparsity/model_sparsity": 0.6157621282979746, + "compression_loss": 70.37678527832031, + "distillation_loss": 2.947099447250366, + "epoch": 2.9, + "learning_rate": 3.942894712125482e-05, + "loss": 72.7191, + "step": 3434, + "task_loss": 2.070704936981201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6579922848227153, + "compression/movement_sparsity/importance_threshold": -0.002395331343355926, + "compression/movement_sparsity/linear_layer_sparsity": 0.6380450393936341, + "compression/movement_sparsity/model_sparsity": 0.6161262294343338, + "compression_loss": 70.42066955566406, + "distillation_loss": 3.0444676876068115, + "epoch": 2.9, + "learning_rate": 3.9424250962712506e-05, + "loss": 73.0662, + "step": 3435, + "task_loss": 2.250734329223633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6584051713557927, + "compression/movement_sparsity/importance_threshold": -0.0023924395955676712, + "compression/movement_sparsity/linear_layer_sparsity": 0.6386006936812977, + "compression/movement_sparsity/model_sparsity": 0.6166627952878236, + "compression_loss": 70.4645004272461, + "distillation_loss": 3.1259472370147705, + "epoch": 2.9, + "learning_rate": 3.941955480417019e-05, + "loss": 72.706, + "step": 3436, + "task_loss": 1.9956539869308472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.658817725452066, + "compression/movement_sparsity/importance_threshold": -0.0023895501760786376, + "compression/movement_sparsity/linear_layer_sparsity": 0.6391136475246573, + "compression/movement_sparsity/model_sparsity": 0.6171581275886332, + "compression_loss": 70.50831604003906, + "distillation_loss": 1.299185037612915, + "epoch": 2.9, + "learning_rate": 3.941485864562788e-05, + "loss": 73.142, + "step": 3437, + "task_loss": 0.844789445400238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6592299472454202, + "compression/movement_sparsity/importance_threshold": -0.002386663083951128, + "compression/movement_sparsity/linear_layer_sparsity": 0.6395717382727242, + "compression/movement_sparsity/model_sparsity": 0.617600481510252, + "compression_loss": 70.55211639404297, + "distillation_loss": 1.9725441932678223, + "epoch": 2.91, + "learning_rate": 3.9410162487085565e-05, + "loss": 72.8861, + "step": 3438, + "task_loss": 1.4636355638504028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6596418368697405, + "compression/movement_sparsity/importance_threshold": -0.002383778318247447, + "compression/movement_sparsity/linear_layer_sparsity": 0.6399490666333935, + "compression/movement_sparsity/model_sparsity": 0.6179648474809345, + "compression_loss": 70.59585571289062, + "distillation_loss": 2.825838327407837, + "epoch": 2.91, + "learning_rate": 3.940546632854326e-05, + "loss": 72.5972, + "step": 3439, + "task_loss": 4.363339900970459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.660053394458912, + "compression/movement_sparsity/importance_threshold": -0.0023808958780299006, + "compression/movement_sparsity/linear_layer_sparsity": 0.6404604464866251, + "compression/movement_sparsity/model_sparsity": 0.6184586598630192, + "compression_loss": 70.63953399658203, + "distillation_loss": 2.800205707550049, + "epoch": 2.91, + "learning_rate": 3.940077017000094e-05, + "loss": 73.1447, + "step": 3440, + "task_loss": 1.7196780443191528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6604646201468201, + "compression/movement_sparsity/importance_threshold": -0.002378015762360788, + "compression/movement_sparsity/linear_layer_sparsity": 0.6409308548998599, + "compression/movement_sparsity/model_sparsity": 0.6189129083001138, + "compression_loss": 70.68316650390625, + "distillation_loss": 3.213515520095825, + "epoch": 2.91, + "learning_rate": 3.939607401145863e-05, + "loss": 73.4888, + "step": 3441, + "task_loss": 2.6564600467681885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6608755140673495, + "compression/movement_sparsity/importance_threshold": -0.0023751379703024185, + "compression/movement_sparsity/linear_layer_sparsity": 0.641431216822196, + "compression/movement_sparsity/model_sparsity": 0.6193960812511243, + "compression_loss": 70.72674560546875, + "distillation_loss": 1.7173619270324707, + "epoch": 2.91, + "learning_rate": 3.939137785291632e-05, + "loss": 73.1429, + "step": 3442, + "task_loss": 0.6602032780647278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.661286076354386, + "compression/movement_sparsity/importance_threshold": -0.00237226250091709, + "compression/movement_sparsity/linear_layer_sparsity": 0.6420269840097866, + "compression/movement_sparsity/model_sparsity": 0.6199713820030269, + "compression_loss": 70.770263671875, + "distillation_loss": 3.48776912689209, + "epoch": 2.91, + "learning_rate": 3.9386681694374e-05, + "loss": 73.933, + "step": 3443, + "task_loss": 1.7559863328933716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6616963071418138, + "compression/movement_sparsity/importance_threshold": -0.002369389353267113, + "compression/movement_sparsity/linear_layer_sparsity": 0.6425734328400353, + "compression/movement_sparsity/model_sparsity": 0.6204990586348834, + "compression_loss": 70.81375885009766, + "distillation_loss": 2.4312257766723633, + "epoch": 2.91, + "learning_rate": 3.938198553583169e-05, + "loss": 73.2838, + "step": 3444, + "task_loss": 1.474777340888977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.662106206563519, + "compression/movement_sparsity/importance_threshold": -0.0023665185264147856, + "compression/movement_sparsity/linear_layer_sparsity": 0.6430591876570174, + "compression/movement_sparsity/model_sparsity": 0.6209681262795456, + "compression_loss": 70.85722351074219, + "distillation_loss": 2.045888900756836, + "epoch": 2.91, + "learning_rate": 3.9377289377289376e-05, + "loss": 74.4556, + "step": 3445, + "task_loss": 1.9601024389266968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.662515774753386, + "compression/movement_sparsity/importance_threshold": -0.002363650019422418, + "compression/movement_sparsity/linear_layer_sparsity": 0.6435720341828682, + "compression/movement_sparsity/model_sparsity": 0.621463354949533, + "compression_loss": 70.90070343017578, + "distillation_loss": 3.6094117164611816, + "epoch": 2.91, + "learning_rate": 3.937259321874707e-05, + "loss": 73.9896, + "step": 3446, + "task_loss": 2.087212324142456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6629250118453004, + "compression/movement_sparsity/importance_threshold": -0.002360783831352309, + "compression/movement_sparsity/linear_layer_sparsity": 0.6440765934122121, + "compression/movement_sparsity/model_sparsity": 0.6219505810171432, + "compression_loss": 70.94407653808594, + "distillation_loss": 3.968432664871216, + "epoch": 2.91, + "learning_rate": 3.9367897060204755e-05, + "loss": 73.4152, + "step": 3447, + "task_loss": 2.3962337970733643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6633339179731472, + "compression/movement_sparsity/importance_threshold": -0.002357919961266764, + "compression/movement_sparsity/linear_layer_sparsity": 0.6445810453240474, + "compression/movement_sparsity/model_sparsity": 0.6224377034539312, + "compression_loss": 70.98741149902344, + "distillation_loss": 3.249356746673584, + "epoch": 2.91, + "learning_rate": 3.936320090166244e-05, + "loss": 73.7963, + "step": 3448, + "task_loss": 1.4402354955673218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6637424932708114, + "compression/movement_sparsity/importance_threshold": -0.0023550584082280886, + "compression/movement_sparsity/linear_layer_sparsity": 0.6450214883040133, + "compression/movement_sparsity/model_sparsity": 0.6228630158625742, + "compression_loss": 71.03071594238281, + "distillation_loss": 4.58958625793457, + "epoch": 2.91, + "learning_rate": 3.935850474312013e-05, + "loss": 74.2047, + "step": 3449, + "task_loss": 2.3297276496887207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6641507378721785, + "compression/movement_sparsity/importance_threshold": -0.002352199171298584, + "compression/movement_sparsity/linear_layer_sparsity": 0.6455775360892088, + "compression/movement_sparsity/model_sparsity": 0.6233999616957452, + "compression_loss": 71.07402801513672, + "distillation_loss": 3.513432025909424, + "epoch": 2.92, + "learning_rate": 3.9353808584577814e-05, + "loss": 73.5117, + "step": 3450, + "task_loss": 1.4783369302749634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6645586519111331, + "compression/movement_sparsity/importance_threshold": -0.0023493422495405574, + "compression/movement_sparsity/linear_layer_sparsity": 0.6459419744246637, + "compression/movement_sparsity/model_sparsity": 0.6237518804532338, + "compression_loss": 71.1172866821289, + "distillation_loss": 3.0493850708007812, + "epoch": 2.92, + "learning_rate": 3.934911242603551e-05, + "loss": 74.1033, + "step": 3451, + "task_loss": 2.0505120754241943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6649662355215611, + "compression/movement_sparsity/importance_threshold": -0.0023464876420163077, + "compression/movement_sparsity/linear_layer_sparsity": 0.6464074104599943, + "compression/movement_sparsity/model_sparsity": 0.6242013273289021, + "compression_loss": 71.16053009033203, + "distillation_loss": 2.648249387741089, + "epoch": 2.92, + "learning_rate": 3.9344416267493194e-05, + "loss": 74.2024, + "step": 3452, + "task_loss": 1.5979214906692505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6653734888373467, + "compression/movement_sparsity/importance_threshold": -0.0023436353477881465, + "compression/movement_sparsity/linear_layer_sparsity": 0.646936962744703, + "compression/movement_sparsity/model_sparsity": 0.6247126878635376, + "compression_loss": 71.20372772216797, + "distillation_loss": 1.356490135192871, + "epoch": 2.92, + "learning_rate": 3.933972010895088e-05, + "loss": 73.8793, + "step": 3453, + "task_loss": 1.0951056480407715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6657804119923759, + "compression/movement_sparsity/importance_threshold": -0.002340785365918371, + "compression/movement_sparsity/linear_layer_sparsity": 0.6474527903124628, + "compression/movement_sparsity/model_sparsity": 0.6252107951674737, + "compression_loss": 71.24679565429688, + "distillation_loss": 3.1165103912353516, + "epoch": 2.92, + "learning_rate": 3.9335023950408567e-05, + "loss": 74.1133, + "step": 3454, + "task_loss": 2.2923457622528076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6661870051205332, + "compression/movement_sparsity/importance_threshold": -0.002337937695469289, + "compression/movement_sparsity/linear_layer_sparsity": 0.6479854190324216, + "compression/movement_sparsity/model_sparsity": 0.6257251264523442, + "compression_loss": 71.28990936279297, + "distillation_loss": 3.222261428833008, + "epoch": 2.92, + "learning_rate": 3.933032779186625e-05, + "loss": 74.6906, + "step": 3455, + "task_loss": 2.9152517318725586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6665932683557042, + "compression/movement_sparsity/importance_threshold": -0.002335092335503202, + "compression/movement_sparsity/linear_layer_sparsity": 0.6484795684634194, + "compression/movement_sparsity/model_sparsity": 0.6262023003302059, + "compression_loss": 71.33300018310547, + "distillation_loss": 2.3612823486328125, + "epoch": 2.92, + "learning_rate": 3.9325631633323946e-05, + "loss": 73.6173, + "step": 3456, + "task_loss": 2.113659381866455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6669992018317739, + "compression/movement_sparsity/importance_threshold": -0.002332249285082416, + "compression/movement_sparsity/linear_layer_sparsity": 0.6487986830376903, + "compression/movement_sparsity/model_sparsity": 0.6265104523371393, + "compression_loss": 71.37603759765625, + "distillation_loss": 3.4456934928894043, + "epoch": 2.92, + "learning_rate": 3.9320935474781626e-05, + "loss": 74.3593, + "step": 3457, + "task_loss": 1.8562883138656616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6674048056826272, + "compression/movement_sparsity/importance_threshold": -0.002329408543269234, + "compression/movement_sparsity/linear_layer_sparsity": 0.6493443925695456, + "compression/movement_sparsity/model_sparsity": 0.6270374150677764, + "compression_loss": 71.41908264160156, + "distillation_loss": 3.216428279876709, + "epoch": 2.92, + "learning_rate": 3.931623931623932e-05, + "loss": 74.2239, + "step": 3458, + "task_loss": 2.017503261566162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6678100800421496, + "compression/movement_sparsity/importance_threshold": -0.002326570109125959, + "compression/movement_sparsity/linear_layer_sparsity": 0.6498514439499253, + "compression/movement_sparsity/model_sparsity": 0.6275270476733678, + "compression_loss": 71.4620590209961, + "distillation_loss": 2.425304412841797, + "epoch": 2.92, + "learning_rate": 3.9311543157697005e-05, + "loss": 74.193, + "step": 3459, + "task_loss": 1.093080997467041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6682150250442258, + "compression/movement_sparsity/importance_threshold": -0.0023237339817148983, + "compression/movement_sparsity/linear_layer_sparsity": 0.6502760635594385, + "compression/movement_sparsity/model_sparsity": 0.6279370802930113, + "compression_loss": 71.50495147705078, + "distillation_loss": 2.074154853820801, + "epoch": 2.92, + "learning_rate": 3.93068469991547e-05, + "loss": 73.8348, + "step": 3460, + "task_loss": 1.9899340867996216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6686196408227416, + "compression/movement_sparsity/importance_threshold": -0.002320900160098352, + "compression/movement_sparsity/linear_layer_sparsity": 0.6507725978239635, + "compression/movement_sparsity/model_sparsity": 0.6284165570780318, + "compression_loss": 71.54790496826172, + "distillation_loss": 1.9942905902862549, + "epoch": 2.93, + "learning_rate": 3.930215084061238e-05, + "loss": 73.8606, + "step": 3461, + "task_loss": 1.4400956630706787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6690239275115815, + "compression/movement_sparsity/importance_threshold": -0.0023180686433386267, + "compression/movement_sparsity/linear_layer_sparsity": 0.6511622915464711, + "compression/movement_sparsity/model_sparsity": 0.6287928636223332, + "compression_loss": 71.59075927734375, + "distillation_loss": 1.6505866050720215, + "epoch": 2.93, + "learning_rate": 3.9297454682070064e-05, + "loss": 74.2581, + "step": 3462, + "task_loss": 1.370280385017395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6694278852446311, + "compression/movement_sparsity/importance_threshold": -0.0023152394304980237, + "compression/movement_sparsity/linear_layer_sparsity": 0.6516874318891545, + "compression/movement_sparsity/model_sparsity": 0.6292999637787248, + "compression_loss": 71.63359069824219, + "distillation_loss": 1.883472204208374, + "epoch": 2.93, + "learning_rate": 3.929275852352776e-05, + "loss": 74.1864, + "step": 3463, + "task_loss": 0.7973963618278503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6698315141557751, + "compression/movement_sparsity/importance_threshold": -0.0023124125206388513, + "compression/movement_sparsity/linear_layer_sparsity": 0.6521459757555915, + "compression/movement_sparsity/model_sparsity": 0.6297427552527038, + "compression_loss": 71.67642974853516, + "distillation_loss": 1.860454797744751, + "epoch": 2.93, + "learning_rate": 3.9288062364985443e-05, + "loss": 73.6535, + "step": 3464, + "task_loss": 1.5932941436767578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6702348143788992, + "compression/movement_sparsity/importance_threshold": -0.002309587912823408, + "compression/movement_sparsity/linear_layer_sparsity": 0.6527133634242088, + "compression/movement_sparsity/model_sparsity": 0.6302906514094154, + "compression_loss": 71.71917724609375, + "distillation_loss": 3.1045479774475098, + "epoch": 2.93, + "learning_rate": 3.9283366206443137e-05, + "loss": 73.9588, + "step": 3465, + "task_loss": 1.8902684450149536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6706377860478878, + "compression/movement_sparsity/importance_threshold": -0.0023067656061140033, + "compression/movement_sparsity/linear_layer_sparsity": 0.653129338012186, + "compression/movement_sparsity/model_sparsity": 0.630692335990608, + "compression_loss": 71.76187133789062, + "distillation_loss": 2.717440128326416, + "epoch": 2.93, + "learning_rate": 3.9278670047900816e-05, + "loss": 74.422, + "step": 3466, + "task_loss": 1.708390474319458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6710404292966268, + "compression/movement_sparsity/importance_threshold": -0.0023039455995729373, + "compression/movement_sparsity/linear_layer_sparsity": 0.6534962446503414, + "compression/movement_sparsity/model_sparsity": 0.631046638257006, + "compression_loss": 71.80453491210938, + "distillation_loss": 2.693601369857788, + "epoch": 2.93, + "learning_rate": 3.927397388935851e-05, + "loss": 74.4821, + "step": 3467, + "task_loss": 1.571959137916565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.671442744259001, + "compression/movement_sparsity/importance_threshold": -0.0023011278922625148, + "compression/movement_sparsity/linear_layer_sparsity": 0.6539202561273051, + "compression/movement_sparsity/model_sparsity": 0.6314560836353241, + "compression_loss": 71.84719848632812, + "distillation_loss": 2.535973072052002, + "epoch": 2.93, + "learning_rate": 3.9269277730816196e-05, + "loss": 74.7812, + "step": 3468, + "task_loss": 1.4639322757720947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6718447310688952, + "compression/movement_sparsity/importance_threshold": -0.0022983124832450423, + "compression/movement_sparsity/linear_layer_sparsity": 0.654498888286003, + "compression/movement_sparsity/model_sparsity": 0.63201483799929, + "compression_loss": 71.8897933959961, + "distillation_loss": 2.520115375518799, + "epoch": 2.93, + "learning_rate": 3.926458157227388e-05, + "loss": 74.273, + "step": 3469, + "task_loss": 1.4775841236114502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6722463898601954, + "compression/movement_sparsity/importance_threshold": -0.0022954993715828187, + "compression/movement_sparsity/linear_layer_sparsity": 0.654968664718353, + "compression/movement_sparsity/model_sparsity": 0.6324684761659876, + "compression_loss": 71.93238067626953, + "distillation_loss": 3.3124871253967285, + "epoch": 2.93, + "learning_rate": 3.925988541373157e-05, + "loss": 74.7481, + "step": 3470, + "task_loss": 1.8966056108474731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6726477207667857, + "compression/movement_sparsity/importance_threshold": -0.0022926885563381537, + "compression/movement_sparsity/linear_layer_sparsity": 0.6553491649076134, + "compression/movement_sparsity/model_sparsity": 0.6328359050031913, + "compression_loss": 71.97491455078125, + "distillation_loss": 1.986694574356079, + "epoch": 2.93, + "learning_rate": 3.9255189255189255e-05, + "loss": 74.3775, + "step": 3471, + "task_loss": 1.648299217224121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6730487239225522, + "compression/movement_sparsity/importance_threshold": -0.002289880036573346, + "compression/movement_sparsity/linear_layer_sparsity": 0.6558457826413119, + "compression/movement_sparsity/model_sparsity": 0.6333154623899624, + "compression_loss": 72.01742553710938, + "distillation_loss": 2.9936611652374268, + "epoch": 2.93, + "learning_rate": 3.925049309664695e-05, + "loss": 75.2096, + "step": 3472, + "task_loss": 2.2089285850524902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6734493994613793, + "compression/movement_sparsity/importance_threshold": -0.002287073811350704, + "compression/movement_sparsity/linear_layer_sparsity": 0.6562686732465177, + "compression/movement_sparsity/model_sparsity": 0.6337238254019157, + "compression_loss": 72.05985260009766, + "distillation_loss": 1.813371181488037, + "epoch": 2.94, + "learning_rate": 3.9245796938104634e-05, + "loss": 74.6188, + "step": 3473, + "task_loss": 1.2873101234436035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6738497475171527, + "compression/movement_sparsity/importance_threshold": -0.0022842698797325284, + "compression/movement_sparsity/linear_layer_sparsity": 0.6568422734064733, + "compression/movement_sparsity/model_sparsity": 0.6342777206317763, + "compression_loss": 72.10228729248047, + "distillation_loss": 3.282653331756592, + "epoch": 2.94, + "learning_rate": 3.924110077956232e-05, + "loss": 75.2994, + "step": 3474, + "task_loss": 1.5861778259277344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6742497682237569, + "compression/movement_sparsity/importance_threshold": -0.002281468240781127, + "compression/movement_sparsity/linear_layer_sparsity": 0.6572695878778722, + "compression/movement_sparsity/model_sparsity": 0.6346903555365095, + "compression_loss": 72.14468383789062, + "distillation_loss": 3.0566153526306152, + "epoch": 2.94, + "learning_rate": 3.923640462102001e-05, + "loss": 74.8987, + "step": 3475, + "task_loss": 2.2819125652313232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6746494617150778, + "compression/movement_sparsity/importance_threshold": -0.002278668893558799, + "compression/movement_sparsity/linear_layer_sparsity": 0.6577697351651908, + "compression/movement_sparsity/model_sparsity": 0.6351733212258757, + "compression_loss": 72.18702697753906, + "distillation_loss": 3.2139034271240234, + "epoch": 2.94, + "learning_rate": 3.923170846247769e-05, + "loss": 74.7968, + "step": 3476, + "task_loss": 1.3146706819534302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.675048828125, + "compression/movement_sparsity/importance_threshold": -0.002275871837127852, + "compression/movement_sparsity/linear_layer_sparsity": 0.6582559669488783, + "compression/movement_sparsity/model_sparsity": 0.6356428494519697, + "compression_loss": 72.22935485839844, + "distillation_loss": 2.0855443477630615, + "epoch": 2.94, + "learning_rate": 3.9227012303935386e-05, + "loss": 74.7817, + "step": 3477, + "task_loss": 1.0672084093093872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6754478675874089, + "compression/movement_sparsity/importance_threshold": -0.002273077070550587, + "compression/movement_sparsity/linear_layer_sparsity": 0.6587030039935469, + "compression/movement_sparsity/model_sparsity": 0.6360745293989071, + "compression_loss": 72.2716064453125, + "distillation_loss": 2.6618552207946777, + "epoch": 2.94, + "learning_rate": 3.9222316145393066e-05, + "loss": 75.107, + "step": 3478, + "task_loss": 1.771591305732727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6758465802361895, + "compression/movement_sparsity/importance_threshold": -0.0022702845928893114, + "compression/movement_sparsity/linear_layer_sparsity": 0.6591643381152107, + "compression/movement_sparsity/model_sparsity": 0.636520015274262, + "compression_loss": 72.31391906738281, + "distillation_loss": 2.6960034370422363, + "epoch": 2.94, + "learning_rate": 3.921761998685076e-05, + "loss": 74.866, + "step": 3479, + "task_loss": 1.9516239166259766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6762449662052268, + "compression/movement_sparsity/importance_threshold": -0.0022674944032063287, + "compression/movement_sparsity/linear_layer_sparsity": 0.6596007984231862, + "compression/movement_sparsity/model_sparsity": 0.6369414818279496, + "compression_loss": 72.35610961914062, + "distillation_loss": 3.267400026321411, + "epoch": 2.94, + "learning_rate": 3.9212923828308445e-05, + "loss": 74.549, + "step": 3480, + "task_loss": 2.1141860485076904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6766430256284065, + "compression/movement_sparsity/importance_threshold": -0.0022647065005639384, + "compression/movement_sparsity/linear_layer_sparsity": 0.660014292784295, + "compression/movement_sparsity/model_sparsity": 0.6373407713856969, + "compression_loss": 72.3982925415039, + "distillation_loss": 4.121175289154053, + "epoch": 2.94, + "learning_rate": 3.920822766976613e-05, + "loss": 75.1532, + "step": 3481, + "task_loss": 2.6890432834625244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.677040758639613, + "compression/movement_sparsity/importance_threshold": -0.0022619208840244506, + "compression/movement_sparsity/linear_layer_sparsity": 0.66050336251988, + "compression/movement_sparsity/model_sparsity": 0.63781304007131, + "compression_loss": 72.44044494628906, + "distillation_loss": 2.5138580799102783, + "epoch": 2.94, + "learning_rate": 3.9203531511223825e-05, + "loss": 74.9983, + "step": 3482, + "task_loss": 1.8378770351409912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6774381653727322, + "compression/movement_sparsity/importance_threshold": -0.002259137552650164, + "compression/movement_sparsity/linear_layer_sparsity": 0.661125935236316, + "compression/movement_sparsity/model_sparsity": 0.6384142254996787, + "compression_loss": 72.4825210571289, + "distillation_loss": 3.3643221855163574, + "epoch": 2.94, + "learning_rate": 3.9198835352681504e-05, + "loss": 75.5401, + "step": 3483, + "task_loss": 1.4657042026519775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6778352459616486, + "compression/movement_sparsity/importance_threshold": -0.0022563565055033867, + "compression/movement_sparsity/linear_layer_sparsity": 0.6614776504849035, + "compression/movement_sparsity/model_sparsity": 0.6387538582474745, + "compression_loss": 72.52464294433594, + "distillation_loss": 3.1803340911865234, + "epoch": 2.94, + "learning_rate": 3.91941391941392e-05, + "loss": 75.2395, + "step": 3484, + "task_loss": 1.7452113628387451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6782320005402478, + "compression/movement_sparsity/importance_threshold": -0.0022535777416464193, + "compression/movement_sparsity/linear_layer_sparsity": 0.6619202636633791, + "compression/movement_sparsity/model_sparsity": 0.6391812663016321, + "compression_loss": 72.56664276123047, + "distillation_loss": 3.8766181468963623, + "epoch": 2.95, + "learning_rate": 3.9189443035596884e-05, + "loss": 75.2579, + "step": 3485, + "task_loss": 2.7921416759490967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6786284292424143, + "compression/movement_sparsity/importance_threshold": -0.002250801260141571, + "compression/movement_sparsity/linear_layer_sparsity": 0.6623848292344723, + "compression/movement_sparsity/model_sparsity": 0.6396298726161874, + "compression_loss": 72.60862731933594, + "distillation_loss": 1.689774513244629, + "epoch": 2.95, + "learning_rate": 3.918474687705457e-05, + "loss": 75.1176, + "step": 3486, + "task_loss": 1.2145459651947021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6790245322020341, + "compression/movement_sparsity/importance_threshold": -0.002248027060051138, + "compression/movement_sparsity/linear_layer_sparsity": 0.6628492755638892, + "compression/movement_sparsity/model_sparsity": 0.6400783637853847, + "compression_loss": 72.65055847167969, + "distillation_loss": 2.478976011276245, + "epoch": 2.95, + "learning_rate": 3.9180050718512256e-05, + "loss": 75.03, + "step": 3487, + "task_loss": 2.0646066665649414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6794203095529917, + "compression/movement_sparsity/importance_threshold": -0.0022452551404374306, + "compression/movement_sparsity/linear_layer_sparsity": 0.6632070602138032, + "compression/movement_sparsity/model_sparsity": 0.6404238574319, + "compression_loss": 72.69247436523438, + "distillation_loss": 2.3888416290283203, + "epoch": 2.95, + "learning_rate": 3.917535455996994e-05, + "loss": 75.2002, + "step": 3488, + "task_loss": 1.6714826822280884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6798157614291724, + "compression/movement_sparsity/importance_threshold": -0.002242485500362751, + "compression/movement_sparsity/linear_layer_sparsity": 0.6638220491596228, + "compression/movement_sparsity/model_sparsity": 0.6410177196155034, + "compression_loss": 72.7343521118164, + "distillation_loss": 3.100355863571167, + "epoch": 2.95, + "learning_rate": 3.9170658401427636e-05, + "loss": 75.6369, + "step": 3489, + "task_loss": 2.152700185775757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6802108879644617, + "compression/movement_sparsity/importance_threshold": -0.0022397181388894016, + "compression/movement_sparsity/linear_layer_sparsity": 0.6644480321880027, + "compression/movement_sparsity/model_sparsity": 0.6416221982011094, + "compression_loss": 72.77621459960938, + "distillation_loss": 3.2960681915283203, + "epoch": 2.95, + "learning_rate": 3.916596224288532e-05, + "loss": 75.6987, + "step": 3490, + "task_loss": 1.9321776628494263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6806056892927441, + "compression/movement_sparsity/importance_threshold": -0.002236953055079689, + "compression/movement_sparsity/linear_layer_sparsity": 0.6647963490488139, + "compression/movement_sparsity/model_sparsity": 0.6419585493062038, + "compression_loss": 72.81808471679688, + "distillation_loss": 3.720386028289795, + "epoch": 2.95, + "learning_rate": 3.916126608434301e-05, + "loss": 76.0369, + "step": 3491, + "task_loss": 2.1342122554779053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6810001655479054, + "compression/movement_sparsity/importance_threshold": -0.002234190247995914, + "compression/movement_sparsity/linear_layer_sparsity": 0.6652792062930606, + "compression/movement_sparsity/model_sparsity": 0.6424248189186679, + "compression_loss": 72.85981750488281, + "distillation_loss": 4.349209785461426, + "epoch": 2.95, + "learning_rate": 3.9156569925800695e-05, + "loss": 76.289, + "step": 3492, + "task_loss": 1.3346827030181885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6813943168638301, + "compression/movement_sparsity/importance_threshold": -0.002231429716700385, + "compression/movement_sparsity/linear_layer_sparsity": 0.6656481996605523, + "compression/movement_sparsity/model_sparsity": 0.6427811362288299, + "compression_loss": 72.9016342163086, + "distillation_loss": 2.6415183544158936, + "epoch": 2.95, + "learning_rate": 3.915187376725839e-05, + "loss": 75.8007, + "step": 3493, + "task_loss": 1.3116129636764526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.681788143374404, + "compression/movement_sparsity/importance_threshold": -0.0022286714602554004, + "compression/movement_sparsity/linear_layer_sparsity": 0.6661606407647035, + "compression/movement_sparsity/model_sparsity": 0.6432759734046003, + "compression_loss": 72.943359375, + "distillation_loss": 3.14553165435791, + "epoch": 2.95, + "learning_rate": 3.9147177608716074e-05, + "loss": 75.9407, + "step": 3494, + "task_loss": 2.4717612266540527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6821816452135114, + "compression/movement_sparsity/importance_threshold": -0.00222591547772327, + "compression/movement_sparsity/linear_layer_sparsity": 0.6666884401967698, + "compression/movement_sparsity/model_sparsity": 0.6437856413024741, + "compression_loss": 72.98501586914062, + "distillation_loss": 2.3049697875976562, + "epoch": 2.95, + "learning_rate": 3.914248145017376e-05, + "loss": 75.765, + "step": 3495, + "task_loss": 1.2274569272994995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6825748225150383, + "compression/movement_sparsity/importance_threshold": -0.002223161768166294, + "compression/movement_sparsity/linear_layer_sparsity": 0.6671546274546615, + "compression/movement_sparsity/model_sparsity": 0.6442358135938975, + "compression_loss": 73.02664947509766, + "distillation_loss": 5.13464879989624, + "epoch": 2.95, + "learning_rate": 3.913778529163145e-05, + "loss": 76.3102, + "step": 3496, + "task_loss": 2.374159097671509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6829676754128695, + "compression/movement_sparsity/importance_threshold": -0.0022204103306467764, + "compression/movement_sparsity/linear_layer_sparsity": 0.6676211128167441, + "compression/movement_sparsity/model_sparsity": 0.6446862737487157, + "compression_loss": 73.06830596923828, + "distillation_loss": 3.086169719696045, + "epoch": 2.96, + "learning_rate": 3.913308913308913e-05, + "loss": 76.0823, + "step": 3497, + "task_loss": 1.9866819381713867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.68336020404089, + "compression/movement_sparsity/importance_threshold": -0.0022176611642270227, + "compression/movement_sparsity/linear_layer_sparsity": 0.6681135570917699, + "compression/movement_sparsity/model_sparsity": 0.6451618010479586, + "compression_loss": 73.10980224609375, + "distillation_loss": 2.7235374450683594, + "epoch": 2.96, + "learning_rate": 3.9128392974546826e-05, + "loss": 75.8763, + "step": 3498, + "task_loss": 2.1649608612060547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6837524085329851, + "compression/movement_sparsity/importance_threshold": -0.002214914267969336, + "compression/movement_sparsity/linear_layer_sparsity": 0.6685471436753452, + "compression/movement_sparsity/model_sparsity": 0.6455804925985197, + "compression_loss": 73.15141296386719, + "distillation_loss": 3.235076427459717, + "epoch": 2.96, + "learning_rate": 3.912369681600451e-05, + "loss": 76.1227, + "step": 3499, + "task_loss": 2.4782118797302246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6841442890230398, + "compression/movement_sparsity/importance_threshold": -0.002212169640936022, + "compression/movement_sparsity/linear_layer_sparsity": 0.6690345321032934, + "compression/movement_sparsity/model_sparsity": 0.6460511377345858, + "compression_loss": 73.19290161132812, + "distillation_loss": 3.722426414489746, + "epoch": 2.96, + "learning_rate": 3.91190006574622e-05, + "loss": 75.576, + "step": 3500, + "task_loss": 2.0049216747283936 + }, + { + "epoch": 2.96, + "eval_accuracy": 0.695089108910891, + "eval_loss": 75.57647705078125, + "eval_runtime": 227.9882, + "eval_samples_per_second": 110.751, + "eval_steps_per_second": 0.868, + "step": 3500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6845358456449396, + "compression/movement_sparsity/importance_threshold": -0.0022094272821893803, + "compression/movement_sparsity/linear_layer_sparsity": 0.6696490202340724, + "compression/movement_sparsity/model_sparsity": 0.6466445163076858, + "compression_loss": 73.23440551757812, + "distillation_loss": 2.712944507598877, + "epoch": 2.96, + "learning_rate": 3.9114304498919885e-05, + "loss": 76.3992, + "step": 3501, + "task_loss": 1.5083972215652466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6849270785325692, + "compression/movement_sparsity/importance_threshold": -0.00220668719079172, + "compression/movement_sparsity/linear_layer_sparsity": 0.6700302477975585, + "compression/movement_sparsity/model_sparsity": 0.6470126475315732, + "compression_loss": 73.27590942382812, + "distillation_loss": 3.7012975215911865, + "epoch": 2.96, + "learning_rate": 3.910960834037757e-05, + "loss": 76.4017, + "step": 3502, + "task_loss": 2.568208694458008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6853179878198141, + "compression/movement_sparsity/importance_threshold": -0.002203949365805341, + "compression/movement_sparsity/linear_layer_sparsity": 0.6703995750417441, + "compression/movement_sparsity/model_sparsity": 0.6473692872487374, + "compression_loss": 73.31735229492188, + "distillation_loss": 3.4675464630126953, + "epoch": 2.96, + "learning_rate": 3.9104912181835265e-05, + "loss": 76.1554, + "step": 3503, + "task_loss": 1.3452204465866089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6857085736405589, + "compression/movement_sparsity/importance_threshold": -0.0022012138062925523, + "compression/movement_sparsity/linear_layer_sparsity": 0.6708060579922831, + "compression/movement_sparsity/model_sparsity": 0.6477618062594376, + "compression_loss": 73.35868835449219, + "distillation_loss": 3.877480983734131, + "epoch": 2.96, + "learning_rate": 3.9100216023292944e-05, + "loss": 76.3187, + "step": 3504, + "task_loss": 2.494652271270752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6860988361286895, + "compression/movement_sparsity/importance_threshold": -0.002198480511315652, + "compression/movement_sparsity/linear_layer_sparsity": 0.6710963399292105, + "compression/movement_sparsity/model_sparsity": 0.6480421161188198, + "compression_loss": 73.40009307861328, + "distillation_loss": 3.388078212738037, + "epoch": 2.96, + "learning_rate": 3.909551986475064e-05, + "loss": 76.2064, + "step": 3505, + "task_loss": 1.7386434078216553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6864887754180904, + "compression/movement_sparsity/importance_threshold": -0.002195749479936948, + "compression/movement_sparsity/linear_layer_sparsity": 0.6715159036916459, + "compression/movement_sparsity/model_sparsity": 0.6484472665752865, + "compression_loss": 73.44144439697266, + "distillation_loss": 2.8493294715881348, + "epoch": 2.96, + "learning_rate": 3.9090823706208324e-05, + "loss": 76.7381, + "step": 3506, + "task_loss": 2.541344404220581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6868783916426471, + "compression/movement_sparsity/importance_threshold": -0.002193020711218743, + "compression/movement_sparsity/linear_layer_sparsity": 0.6719584930217863, + "compression/movement_sparsity/model_sparsity": 0.6488746516003725, + "compression_loss": 73.4827651977539, + "distillation_loss": 2.8282599449157715, + "epoch": 2.96, + "learning_rate": 3.908612754766601e-05, + "loss": 76.5251, + "step": 3507, + "task_loss": 1.9676721096038818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6872676849362445, + "compression/movement_sparsity/importance_threshold": -0.002190294204223342, + "compression/movement_sparsity/linear_layer_sparsity": 0.6723945479080622, + "compression/movement_sparsity/model_sparsity": 0.649295726659843, + "compression_loss": 73.52401733398438, + "distillation_loss": 2.9349255561828613, + "epoch": 2.96, + "learning_rate": 3.90814313891237e-05, + "loss": 76.374, + "step": 3508, + "task_loss": 1.529220461845398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6876566554327679, + "compression/movement_sparsity/importance_threshold": -0.002187569958013047, + "compression/movement_sparsity/linear_layer_sparsity": 0.6728441009521019, + "compression/movement_sparsity/model_sparsity": 0.6497298361738331, + "compression_loss": 73.56525421142578, + "distillation_loss": 2.720327377319336, + "epoch": 2.97, + "learning_rate": 3.907673523058138e-05, + "loss": 76.1778, + "step": 3509, + "task_loss": 2.768578052520752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6880453032661026, + "compression/movement_sparsity/importance_threshold": -0.0021848479716501624, + "compression/movement_sparsity/linear_layer_sparsity": 0.6732191756450879, + "compression/movement_sparsity/model_sparsity": 0.6500920258972503, + "compression_loss": 73.60649108886719, + "distillation_loss": 4.758685111999512, + "epoch": 2.97, + "learning_rate": 3.9072039072039076e-05, + "loss": 77.0673, + "step": 3510, + "task_loss": 3.797227382659912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6884336285701332, + "compression/movement_sparsity/importance_threshold": -0.0021821282441969944, + "compression/movement_sparsity/linear_layer_sparsity": 0.6737286476314979, + "compression/movement_sparsity/model_sparsity": 0.6505839959536079, + "compression_loss": 73.64765930175781, + "distillation_loss": 2.2860989570617676, + "epoch": 2.97, + "learning_rate": 3.906734291349676e-05, + "loss": 76.3011, + "step": 3511, + "task_loss": 3.141240358352661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6888216314787456, + "compression/movement_sparsity/importance_threshold": -0.0021794107747158432, + "compression/movement_sparsity/linear_layer_sparsity": 0.6742102170656398, + "compression/movement_sparsity/model_sparsity": 0.6510490219962062, + "compression_loss": 73.68878173828125, + "distillation_loss": 2.670865535736084, + "epoch": 2.97, + "learning_rate": 3.906264675495445e-05, + "loss": 76.0113, + "step": 3512, + "task_loss": 1.9518189430236816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6892093121258241, + "compression/movement_sparsity/importance_threshold": -0.002176695562269018, + "compression/movement_sparsity/linear_layer_sparsity": 0.6746543088409023, + "compression/movement_sparsity/model_sparsity": 0.6514778578528023, + "compression_loss": 73.72982025146484, + "distillation_loss": 3.69225811958313, + "epoch": 2.97, + "learning_rate": 3.9057950596412135e-05, + "loss": 76.7323, + "step": 3513, + "task_loss": 2.1744210720062256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6895966706452548, + "compression/movement_sparsity/importance_threshold": -0.0021739826059188165, + "compression/movement_sparsity/linear_layer_sparsity": 0.6749810668066277, + "compression/movement_sparsity/model_sparsity": 0.6517933906771801, + "compression_loss": 73.77082824707031, + "distillation_loss": 4.02522087097168, + "epoch": 2.97, + "learning_rate": 3.905325443786982e-05, + "loss": 76.1552, + "step": 3514, + "task_loss": 2.1607179641723633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6899837071709218, + "compression/movement_sparsity/importance_threshold": -0.0021712719047275487, + "compression/movement_sparsity/linear_layer_sparsity": 0.6754061514586787, + "compression/movement_sparsity/model_sparsity": 0.6522038723637197, + "compression_loss": 73.81180572509766, + "distillation_loss": 1.8403013944625854, + "epoch": 2.97, + "learning_rate": 3.9048558279327515e-05, + "loss": 76.755, + "step": 3515, + "task_loss": 1.2503207921981812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.690370421836711, + "compression/movement_sparsity/importance_threshold": -0.0021685634577575143, + "compression/movement_sparsity/linear_layer_sparsity": 0.6759023995431803, + "compression/movement_sparsity/model_sparsity": 0.6526830727998811, + "compression_loss": 73.85281372070312, + "distillation_loss": 4.190285682678223, + "epoch": 2.97, + "learning_rate": 3.90438621207852e-05, + "loss": 76.5528, + "step": 3516, + "task_loss": 2.2722229957580566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6907568147765073, + "compression/movement_sparsity/importance_threshold": -0.0021658572640710187, + "compression/movement_sparsity/linear_layer_sparsity": 0.6763273888018901, + "compression/movement_sparsity/model_sparsity": 0.6530934623701343, + "compression_loss": 73.89366149902344, + "distillation_loss": 2.6246254444122314, + "epoch": 2.97, + "learning_rate": 3.903916596224289e-05, + "loss": 76.7156, + "step": 3517, + "task_loss": 1.2652907371520996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6911428861241957, + "compression/movement_sparsity/importance_threshold": -0.002163153322730367, + "compression/movement_sparsity/linear_layer_sparsity": 0.6766816677465193, + "compression/movement_sparsity/model_sparsity": 0.653435570743126, + "compression_loss": 73.93456268310547, + "distillation_loss": 2.761120557785034, + "epoch": 2.97, + "learning_rate": 3.9034469803700574e-05, + "loss": 76.4856, + "step": 3518, + "task_loss": 1.4737898111343384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6915286360136617, + "compression/movement_sparsity/importance_threshold": -0.002160451632797861, + "compression/movement_sparsity/linear_layer_sparsity": 0.6769343885553938, + "compression/movement_sparsity/model_sparsity": 0.6536796098147554, + "compression_loss": 73.9753646850586, + "distillation_loss": 2.7445590496063232, + "epoch": 2.97, + "learning_rate": 3.902977364515826e-05, + "loss": 77.0944, + "step": 3519, + "task_loss": 1.4786510467529297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6919140645787898, + "compression/movement_sparsity/importance_threshold": -0.002157752193335808, + "compression/movement_sparsity/linear_layer_sparsity": 0.6774305293223868, + "compression/movement_sparsity/model_sparsity": 0.6541587066200947, + "compression_loss": 74.01618194580078, + "distillation_loss": 2.495396614074707, + "epoch": 2.97, + "learning_rate": 3.902507748661595e-05, + "loss": 77.1416, + "step": 3520, + "task_loss": 1.097321629524231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.692299171953466, + "compression/movement_sparsity/importance_threshold": -0.002155055003406507, + "compression/movement_sparsity/linear_layer_sparsity": 0.6778166815671065, + "compression/movement_sparsity/model_sparsity": 0.6545315933472651, + "compression_loss": 74.05694580078125, + "distillation_loss": 1.8660991191864014, + "epoch": 2.98, + "learning_rate": 3.902038132807363e-05, + "loss": 76.3922, + "step": 3521, + "task_loss": 1.5311673879623413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6926839582715747, + "compression/movement_sparsity/importance_threshold": -0.0021523600620722673, + "compression/movement_sparsity/linear_layer_sparsity": 0.6782045389677983, + "compression/movement_sparsity/model_sparsity": 0.6549061266530541, + "compression_loss": 74.0976791381836, + "distillation_loss": 2.4687018394470215, + "epoch": 2.98, + "learning_rate": 3.9015685169531326e-05, + "loss": 76.2609, + "step": 3522, + "task_loss": 1.1037546396255493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6930684236670016, + "compression/movement_sparsity/importance_threshold": -0.002149667368395388, + "compression/movement_sparsity/linear_layer_sparsity": 0.6787277356711571, + "compression/movement_sparsity/model_sparsity": 0.6554113499401112, + "compression_loss": 74.1384048461914, + "distillation_loss": 3.080456018447876, + "epoch": 2.98, + "learning_rate": 3.901098901098901e-05, + "loss": 77.0713, + "step": 3523, + "task_loss": 1.8184940814971924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6934525682736313, + "compression/movement_sparsity/importance_threshold": -0.0021469769214381767, + "compression/movement_sparsity/linear_layer_sparsity": 0.6791918600480478, + "compression/movement_sparsity/model_sparsity": 0.6558595302168421, + "compression_loss": 74.17906951904297, + "distillation_loss": 5.344025611877441, + "epoch": 2.98, + "learning_rate": 3.9006292852446705e-05, + "loss": 77.215, + "step": 3524, + "task_loss": 2.8443946838378906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6938363922253495, + "compression/movement_sparsity/importance_threshold": -0.0021442887202629355, + "compression/movement_sparsity/linear_layer_sparsity": 0.6795479753144927, + "compression/movement_sparsity/model_sparsity": 0.6562034118283461, + "compression_loss": 74.21961212158203, + "distillation_loss": 2.9555575847625732, + "epoch": 2.98, + "learning_rate": 3.9001596693904385e-05, + "loss": 77.0161, + "step": 3525, + "task_loss": 1.3013581037521362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6942198956560408, + "compression/movement_sparsity/importance_threshold": -0.00214160276393197, + "compression/movement_sparsity/linear_layer_sparsity": 0.6798631310651087, + "compression/movement_sparsity/model_sparsity": 0.6565077410093958, + "compression_loss": 74.26028442382812, + "distillation_loss": 3.1672213077545166, + "epoch": 2.98, + "learning_rate": 3.899690053536207e-05, + "loss": 77.5017, + "step": 3526, + "task_loss": 1.9107997417449951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6946030786995907, + "compression/movement_sparsity/importance_threshold": -0.0021389190515075825, + "compression/movement_sparsity/linear_layer_sparsity": 0.680318884676319, + "compression/movement_sparsity/model_sparsity": 0.6569478380819989, + "compression_loss": 74.30083465576172, + "distillation_loss": 4.137566566467285, + "epoch": 2.98, + "learning_rate": 3.8992204376819764e-05, + "loss": 77.5431, + "step": 3527, + "task_loss": 2.018787145614624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6949859414898842, + "compression/movement_sparsity/importance_threshold": -0.0021362375820520784, + "compression/movement_sparsity/linear_layer_sparsity": 0.6807098185122608, + "compression/movement_sparsity/model_sparsity": 0.657325342138023, + "compression_loss": 74.34141540527344, + "distillation_loss": 2.014414072036743, + "epoch": 2.98, + "learning_rate": 3.898750821827745e-05, + "loss": 77.1238, + "step": 3528, + "task_loss": 1.3450452089309692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6953684841608067, + "compression/movement_sparsity/importance_threshold": -0.0021335583546277593, + "compression/movement_sparsity/linear_layer_sparsity": 0.6812058877342478, + "compression/movement_sparsity/model_sparsity": 0.6578043698561474, + "compression_loss": 74.3819351196289, + "distillation_loss": 2.682356834411621, + "epoch": 2.98, + "learning_rate": 3.8982812059735144e-05, + "loss": 77.3362, + "step": 3529, + "task_loss": 0.8740662336349487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.695750706846243, + "compression/movement_sparsity/importance_threshold": -0.0021308813682969314, + "compression/movement_sparsity/linear_layer_sparsity": 0.681595891485114, + "compression/movement_sparsity/model_sparsity": 0.6581809757783794, + "compression_loss": 74.42241668701172, + "distillation_loss": 3.363556385040283, + "epoch": 2.98, + "learning_rate": 3.897811590119282e-05, + "loss": 77.653, + "step": 3530, + "task_loss": 2.9099674224853516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6961326096800781, + "compression/movement_sparsity/importance_threshold": -0.0021282066221219005, + "compression/movement_sparsity/linear_layer_sparsity": 0.6821610016377129, + "compression/movement_sparsity/model_sparsity": 0.6587266726587543, + "compression_loss": 74.46282958984375, + "distillation_loss": 2.7154297828674316, + "epoch": 2.98, + "learning_rate": 3.8973419742650516e-05, + "loss": 77.5275, + "step": 3531, + "task_loss": 1.8898917436599731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6965141927961978, + "compression/movement_sparsity/importance_threshold": -0.0021255341151649645, + "compression/movement_sparsity/linear_layer_sparsity": 0.6825992863433367, + "compression/movement_sparsity/model_sparsity": 0.6591499009364185, + "compression_loss": 74.5032730102539, + "distillation_loss": 3.3017425537109375, + "epoch": 2.99, + "learning_rate": 3.89687235841082e-05, + "loss": 77.5874, + "step": 3532, + "task_loss": 1.4407390356063843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6968954563284865, + "compression/movement_sparsity/importance_threshold": -0.002122863846488434, + "compression/movement_sparsity/linear_layer_sparsity": 0.6830630410710307, + "compression/movement_sparsity/model_sparsity": 0.6595977242625397, + "compression_loss": 74.5436019897461, + "distillation_loss": 2.367194175720215, + "epoch": 2.99, + "learning_rate": 3.896402742556589e-05, + "loss": 77.7123, + "step": 3533, + "task_loss": 1.6369836330413818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6972764004108299, + "compression/movement_sparsity/importance_threshold": -0.002120195815154608, + "compression/movement_sparsity/linear_layer_sparsity": 0.6834213980809912, + "compression/movement_sparsity/model_sparsity": 0.6599437706067731, + "compression_loss": 74.58390808105469, + "distillation_loss": 4.44533109664917, + "epoch": 2.99, + "learning_rate": 3.8959331267023575e-05, + "loss": 78.5384, + "step": 3534, + "task_loss": 2.429835796356201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6976570251771128, + "compression/movement_sparsity/importance_threshold": -0.0021175300202257935, + "compression/movement_sparsity/linear_layer_sparsity": 0.6838346778070826, + "compression/movement_sparsity/model_sparsity": 0.6603428529028761, + "compression_loss": 74.62417602539062, + "distillation_loss": 3.7314069271087646, + "epoch": 2.99, + "learning_rate": 3.895463510848126e-05, + "loss": 77.8136, + "step": 3535, + "task_loss": 2.0037572383880615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6980373307612205, + "compression/movement_sparsity/importance_threshold": -0.002114866460764294, + "compression/movement_sparsity/linear_layer_sparsity": 0.6842380604740362, + "compression/movement_sparsity/model_sparsity": 0.6607323781342695, + "compression_loss": 74.66445922851562, + "distillation_loss": 2.8498382568359375, + "epoch": 2.99, + "learning_rate": 3.8949938949938955e-05, + "loss": 77.5993, + "step": 3536, + "task_loss": 1.7319574356079102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6984173172970382, + "compression/movement_sparsity/importance_threshold": -0.002112205135832412, + "compression/movement_sparsity/linear_layer_sparsity": 0.6846356479955188, + "compression/movement_sparsity/model_sparsity": 0.661116307301267, + "compression_loss": 74.70462799072266, + "distillation_loss": 3.473423480987549, + "epoch": 2.99, + "learning_rate": 3.894524279139664e-05, + "loss": 77.6012, + "step": 3537, + "task_loss": 2.665347099304199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6987969849184509, + "compression/movement_sparsity/importance_threshold": -0.0021095460444924526, + "compression/movement_sparsity/linear_layer_sparsity": 0.6851353302402996, + "compression/movement_sparsity/model_sparsity": 0.6615988239237373, + "compression_loss": 74.74482727050781, + "distillation_loss": 2.902806520462036, + "epoch": 2.99, + "learning_rate": 3.894054663285433e-05, + "loss": 77.6592, + "step": 3538, + "task_loss": 1.6976743936538696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6991763337593437, + "compression/movement_sparsity/importance_threshold": -0.0021068891858067196, + "compression/movement_sparsity/linear_layer_sparsity": 0.685689649021188, + "compression/movement_sparsity/model_sparsity": 0.662134100149218, + "compression_loss": 74.78495788574219, + "distillation_loss": 3.601815700531006, + "epoch": 2.99, + "learning_rate": 3.8935850474312014e-05, + "loss": 77.7685, + "step": 3539, + "task_loss": 2.4926445484161377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6995553639536018, + "compression/movement_sparsity/importance_threshold": -0.002104234558837518, + "compression/movement_sparsity/linear_layer_sparsity": 0.6861877095791704, + "compression/movement_sparsity/model_sparsity": 0.6626150507948202, + "compression_loss": 74.8250732421875, + "distillation_loss": 4.3156538009643555, + "epoch": 2.99, + "learning_rate": 3.89311543157697e-05, + "loss": 78.3369, + "step": 3540, + "task_loss": 2.5373926162719727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.6999340756351107, + "compression/movement_sparsity/importance_threshold": -0.0021015821626471486, + "compression/movement_sparsity/linear_layer_sparsity": 0.6866432604795308, + "compression/movement_sparsity/model_sparsity": 0.6630549521203148, + "compression_loss": 74.86515045166016, + "distillation_loss": 3.8798952102661133, + "epoch": 2.99, + "learning_rate": 3.892645815722739e-05, + "loss": 78.1098, + "step": 3541, + "task_loss": 2.56223201751709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7003124689377547, + "compression/movement_sparsity/importance_threshold": -0.002098931996297921, + "compression/movement_sparsity/linear_layer_sparsity": 0.6870088912317494, + "compression/movement_sparsity/model_sparsity": 0.6634080223313829, + "compression_loss": 74.9051742553711, + "distillation_loss": 3.5563862323760986, + "epoch": 2.99, + "learning_rate": 3.892176199868508e-05, + "loss": 78.3024, + "step": 3542, + "task_loss": 2.727679491043091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7006905439954199, + "compression/movement_sparsity/importance_threshold": -0.002096284058852132, + "compression/movement_sparsity/linear_layer_sparsity": 0.6875650582586212, + "compression/movement_sparsity/model_sparsity": 0.6639450833099118, + "compression_loss": 74.94525909423828, + "distillation_loss": 4.539097785949707, + "epoch": 2.99, + "learning_rate": 3.8917065840142766e-05, + "loss": 77.5837, + "step": 3543, + "task_loss": 2.4861903190612793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7010683009419907, + "compression/movement_sparsity/importance_threshold": -0.0020936383493720912, + "compression/movement_sparsity/linear_layer_sparsity": 0.6878605629809732, + "compression/movement_sparsity/model_sparsity": 0.6642304365359721, + "compression_loss": 74.9852294921875, + "distillation_loss": 2.2521543502807617, + "epoch": 3.0, + "learning_rate": 3.891236968160045e-05, + "loss": 77.7262, + "step": 3544, + "task_loss": 1.2192625999450684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7014457399113527, + "compression/movement_sparsity/importance_threshold": -0.0020909948669200996, + "compression/movement_sparsity/linear_layer_sparsity": 0.6883322472801451, + "compression/movement_sparsity/model_sparsity": 0.6646859170283967, + "compression_loss": 75.02513122558594, + "distillation_loss": 3.061636209487915, + "epoch": 3.0, + "learning_rate": 3.890767352305814e-05, + "loss": 77.8963, + "step": 3545, + "task_loss": 1.07033371925354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7018228610373907, + "compression/movement_sparsity/importance_threshold": -0.002088353610558465, + "compression/movement_sparsity/linear_layer_sparsity": 0.6886782389282673, + "compression/movement_sparsity/model_sparsity": 0.6650200227990112, + "compression_loss": 75.06502532958984, + "distillation_loss": 3.494990825653076, + "epoch": 3.0, + "learning_rate": 3.890297736451583e-05, + "loss": 77.8407, + "step": 3546, + "task_loss": 1.8421080112457275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7021996644539901, + "compression/movement_sparsity/importance_threshold": -0.0020857145793494866, + "compression/movement_sparsity/linear_layer_sparsity": 0.6890669667931963, + "compression/movement_sparsity/model_sparsity": 0.6653953966659132, + "compression_loss": 75.10491943359375, + "distillation_loss": 3.1889877319335938, + "epoch": 3.0, + "learning_rate": 3.889828120597351e-05, + "loss": 78.1745, + "step": 3547, + "task_loss": 1.9380377531051636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7025761502950361, + "compression/movement_sparsity/importance_threshold": -0.0020830777723554695, + "compression/movement_sparsity/linear_layer_sparsity": 0.6895915347758332, + "compression/movement_sparsity/model_sparsity": 0.6659019441245866, + "compression_loss": 75.144775390625, + "distillation_loss": 2.3045902252197266, + "epoch": 3.0, + "learning_rate": 3.8893585047431204e-05, + "loss": 77.965, + "step": 3548, + "task_loss": 1.477068543434143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7029523186944135, + "compression/movement_sparsity/importance_threshold": -0.0020804431886387196, + "compression/movement_sparsity/linear_layer_sparsity": 0.6900735692525131, + "compression/movement_sparsity/model_sparsity": 0.6663674192340809, + "compression_loss": 75.1845474243164, + "distillation_loss": 3.7252001762390137, + "epoch": 3.0, + "learning_rate": 3.888888888888889e-05, + "loss": 78.5459, + "step": 3549, + "task_loss": 2.7672505378723145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7037037037037036, + "compression/movement_sparsity/importance_threshold": -0.002075180687286236, + "compression/movement_sparsity/linear_layer_sparsity": 0.6908862370494, + "compression/movement_sparsity/model_sparsity": 0.6671521693920863, + "compression_loss": 75.26439666748047, + "distillation_loss": 4.037415504455566, + "epoch": 3.0, + "learning_rate": 3.888419273034658e-05, + "loss": 137.5458, + "step": 3550, + "task_loss": 2.530160665512085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7040789205813869, + "compression/movement_sparsity/importance_threshold": -0.0020725527677751075, + "compression/movement_sparsity/linear_layer_sparsity": 0.691168804049867, + "compression/movement_sparsity/model_sparsity": 0.6674250293468095, + "compression_loss": 75.30401611328125, + "distillation_loss": 2.2475407123565674, + "epoch": 3.0, + "learning_rate": 3.8879496571804263e-05, + "loss": 78.3594, + "step": 3551, + "task_loss": 1.0853458642959595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7044538205529419, + "compression/movement_sparsity/importance_threshold": -0.0020699270677904643, + "compression/movement_sparsity/linear_layer_sparsity": 0.6916943021175795, + "compression/movement_sparsity/model_sparsity": 0.6679324749392749, + "compression_loss": 75.34367370605469, + "distillation_loss": 2.9350528717041016, + "epoch": 3.0, + "learning_rate": 3.887480041326195e-05, + "loss": 78.3286, + "step": 3552, + "task_loss": 1.3534562587738037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7048284037522546, + "compression/movement_sparsity/importance_threshold": -0.0020673035863946034, + "compression/movement_sparsity/linear_layer_sparsity": 0.6921543961258093, + "compression/movement_sparsity/model_sparsity": 0.6683767633029073, + "compression_loss": 75.38325500488281, + "distillation_loss": 2.7520534992218018, + "epoch": 3.0, + "learning_rate": 3.887010425471964e-05, + "loss": 78.2598, + "step": 3553, + "task_loss": 1.9941890239715576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7052026703132095, + "compression/movement_sparsity/importance_threshold": -0.0020646823226498356, + "compression/movement_sparsity/linear_layer_sparsity": 0.6925586134844974, + "compression/movement_sparsity/model_sparsity": 0.6687670945518064, + "compression_loss": 75.42284393310547, + "distillation_loss": 4.449623107910156, + "epoch": 3.0, + "learning_rate": 3.886540809617733e-05, + "loss": 79.0145, + "step": 3554, + "task_loss": 2.9505841732025146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7055766203696922, + "compression/movement_sparsity/importance_threshold": -0.0020620632756184597, + "compression/movement_sparsity/linear_layer_sparsity": 0.6928542374485257, + "compression/movement_sparsity/model_sparsity": 0.6690525629232247, + "compression_loss": 75.46237182617188, + "distillation_loss": 2.5959811210632324, + "epoch": 3.01, + "learning_rate": 3.8860711937635016e-05, + "loss": 78.5603, + "step": 3555, + "task_loss": 1.5232499837875366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7059502540555873, + "compression/movement_sparsity/importance_threshold": -0.002059446444362784, + "compression/movement_sparsity/linear_layer_sparsity": 0.6933199477397121, + "compression/movement_sparsity/model_sparsity": 0.6695022746332162, + "compression_loss": 75.50188446044922, + "distillation_loss": 2.5966243743896484, + "epoch": 3.01, + "learning_rate": 3.88560157790927e-05, + "loss": 78.023, + "step": 3556, + "task_loss": 2.0346181392669678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7063235715047802, + "compression/movement_sparsity/importance_threshold": -0.002056831827945111, + "compression/movement_sparsity/linear_layer_sparsity": 0.6938405092020232, + "compression/movement_sparsity/model_sparsity": 0.6700049532078627, + "compression_loss": 75.54131317138672, + "distillation_loss": 2.037461996078491, + "epoch": 3.01, + "learning_rate": 3.885131962055039e-05, + "loss": 77.7502, + "step": 3557, + "task_loss": 1.2248094081878662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7066965728511565, + "compression/movement_sparsity/importance_threshold": -0.002054219425427741, + "compression/movement_sparsity/linear_layer_sparsity": 0.6942760394249231, + "compression/movement_sparsity/model_sparsity": 0.6704255216277584, + "compression_loss": 75.58076477050781, + "distillation_loss": 5.4873504638671875, + "epoch": 3.01, + "learning_rate": 3.884662346200808e-05, + "loss": 80.0207, + "step": 3558, + "task_loss": 4.12951135635376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7070692582286007, + "compression/movement_sparsity/importance_threshold": -0.002051609235872983, + "compression/movement_sparsity/linear_layer_sparsity": 0.6946422663855234, + "compression/movement_sparsity/model_sparsity": 0.6707791675656161, + "compression_loss": 75.62012481689453, + "distillation_loss": 2.0779519081115723, + "epoch": 3.01, + "learning_rate": 3.884192730346577e-05, + "loss": 78.8415, + "step": 3559, + "task_loss": 0.9649397134780884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7074416277709983, + "compression/movement_sparsity/importance_threshold": -0.0020490012583431376, + "compression/movement_sparsity/linear_layer_sparsity": 0.6950915213253722, + "compression/movement_sparsity/model_sparsity": 0.6712129892162112, + "compression_loss": 75.65950775146484, + "distillation_loss": 4.026408672332764, + "epoch": 3.01, + "learning_rate": 3.8837231144923454e-05, + "loss": 78.7413, + "step": 3560, + "task_loss": 2.7202627658843994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7078136816122341, + "compression/movement_sparsity/importance_threshold": -0.002046395491900511, + "compression/movement_sparsity/linear_layer_sparsity": 0.6954925310829663, + "compression/movement_sparsity/model_sparsity": 0.6716002230549816, + "compression_loss": 75.69883728027344, + "distillation_loss": 3.7912116050720215, + "epoch": 3.01, + "learning_rate": 3.883253498638114e-05, + "loss": 78.7645, + "step": 3561, + "task_loss": 2.6888856887817383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7081854198861938, + "compression/movement_sparsity/importance_threshold": -0.0020437919356074054, + "compression/movement_sparsity/linear_layer_sparsity": 0.6959806588093079, + "compression/movement_sparsity/model_sparsity": 0.6720715820922669, + "compression_loss": 75.73812103271484, + "distillation_loss": 3.135347843170166, + "epoch": 3.01, + "learning_rate": 3.8827838827838833e-05, + "loss": 78.8768, + "step": 3562, + "task_loss": 2.4276068210601807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7085568427267619, + "compression/movement_sparsity/importance_threshold": -0.002041190588526127, + "compression/movement_sparsity/linear_layer_sparsity": 0.6963649151113737, + "compression/movement_sparsity/model_sparsity": 0.672442638008246, + "compression_loss": 75.77738189697266, + "distillation_loss": 2.9291934967041016, + "epoch": 3.01, + "learning_rate": 3.882314266929652e-05, + "loss": 79.1526, + "step": 3563, + "task_loss": 2.239157199859619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7089279502678241, + "compression/movement_sparsity/importance_threshold": -0.0020385914497189767, + "compression/movement_sparsity/linear_layer_sparsity": 0.696942486019152, + "compression/movement_sparsity/model_sparsity": 0.6730003675785262, + "compression_loss": 75.8166275024414, + "distillation_loss": 2.756098747253418, + "epoch": 3.01, + "learning_rate": 3.8818446510754206e-05, + "loss": 78.8098, + "step": 3564, + "task_loss": 1.2386481761932373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.709298742643265, + "compression/movement_sparsity/importance_threshold": -0.002035994518248262, + "compression/movement_sparsity/linear_layer_sparsity": 0.6972778293855753, + "compression/movement_sparsity/model_sparsity": 0.673324190868676, + "compression_loss": 75.85584259033203, + "distillation_loss": 2.5799355506896973, + "epoch": 3.01, + "learning_rate": 3.881375035221189e-05, + "loss": 78.6244, + "step": 3565, + "task_loss": 1.2727571725845337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7096692199869703, + "compression/movement_sparsity/importance_threshold": -0.002033399793176284, + "compression/movement_sparsity/linear_layer_sparsity": 0.6975952030313715, + "compression/movement_sparsity/model_sparsity": 0.6736306617533835, + "compression_loss": 75.89501190185547, + "distillation_loss": 3.2921628952026367, + "epoch": 3.01, + "learning_rate": 3.880905419366958e-05, + "loss": 79.2981, + "step": 3566, + "task_loss": 2.56274151802063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7100393824328246, + "compression/movement_sparsity/importance_threshold": -0.002030807273565349, + "compression/movement_sparsity/linear_layer_sparsity": 0.6979746896663828, + "compression/movement_sparsity/model_sparsity": 0.6739971118550449, + "compression_loss": 75.93416595458984, + "distillation_loss": 3.6278750896453857, + "epoch": 3.02, + "learning_rate": 3.880435803512727e-05, + "loss": 79.2073, + "step": 3567, + "task_loss": 1.7580965757369995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7104092301147134, + "compression/movement_sparsity/importance_threshold": -0.00202821695847776, + "compression/movement_sparsity/linear_layer_sparsity": 0.6984193538016917, + "compression/movement_sparsity/model_sparsity": 0.674426500409359, + "compression_loss": 75.97332763671875, + "distillation_loss": 2.698019504547119, + "epoch": 3.02, + "learning_rate": 3.879966187658495e-05, + "loss": 78.838, + "step": 3568, + "task_loss": 3.2207140922546387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7107787631665219, + "compression/movement_sparsity/importance_threshold": -0.002025628846975819, + "compression/movement_sparsity/linear_layer_sparsity": 0.6988625631885491, + "compression/movement_sparsity/model_sparsity": 0.6748544841903064, + "compression_loss": 76.01234436035156, + "distillation_loss": 3.7371256351470947, + "epoch": 3.02, + "learning_rate": 3.8794965718042645e-05, + "loss": 79.2318, + "step": 3569, + "task_loss": 1.8398133516311646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7111479817221347, + "compression/movement_sparsity/importance_threshold": -0.0020230429381218346, + "compression/movement_sparsity/linear_layer_sparsity": 0.6993459093236688, + "compression/movement_sparsity/model_sparsity": 0.675321225898738, + "compression_loss": 76.05135345458984, + "distillation_loss": 3.2853474617004395, + "epoch": 3.02, + "learning_rate": 3.879026955950033e-05, + "loss": 78.8324, + "step": 3570, + "task_loss": 2.6696784496307373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7115168859154377, + "compression/movement_sparsity/importance_threshold": -0.002020459230978105, + "compression/movement_sparsity/linear_layer_sparsity": 0.6998391286695911, + "compression/movement_sparsity/model_sparsity": 0.6757975016428076, + "compression_loss": 76.09036254882812, + "distillation_loss": 3.316774606704712, + "epoch": 3.02, + "learning_rate": 3.878557340095802e-05, + "loss": 79.1839, + "step": 3571, + "task_loss": 2.796229600906372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7118854758803155, + "compression/movement_sparsity/importance_threshold": -0.002017877724606938, + "compression/movement_sparsity/linear_layer_sparsity": 0.7000861735746708, + "compression/movement_sparsity/model_sparsity": 0.6760360597953989, + "compression_loss": 76.1292953491211, + "distillation_loss": 2.4558284282684326, + "epoch": 3.02, + "learning_rate": 3.878087724241571e-05, + "loss": 78.8733, + "step": 3572, + "task_loss": 2.29618501663208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7122537517506536, + "compression/movement_sparsity/importance_threshold": -0.002015298418070636, + "compression/movement_sparsity/linear_layer_sparsity": 0.7005630568109319, + "compression/movement_sparsity/model_sparsity": 0.67649656062543, + "compression_loss": 76.16820526123047, + "distillation_loss": 3.788773775100708, + "epoch": 3.02, + "learning_rate": 3.877618108387339e-05, + "loss": 78.9283, + "step": 3573, + "task_loss": 3.48488450050354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7126217136603368, + "compression/movement_sparsity/importance_threshold": -0.0020127213104315046, + "compression/movement_sparsity/linear_layer_sparsity": 0.701031557357345, + "compression/movement_sparsity/model_sparsity": 0.6769489667367975, + "compression_loss": 76.2071304321289, + "distillation_loss": 2.7483935356140137, + "epoch": 3.02, + "learning_rate": 3.877148492533108e-05, + "loss": 78.5841, + "step": 3574, + "task_loss": 1.562520146369934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7129893617432504, + "compression/movement_sparsity/importance_threshold": -0.0020101464007518466, + "compression/movement_sparsity/linear_layer_sparsity": 0.7013984282229976, + "compression/movement_sparsity/model_sparsity": 0.6773032344595881, + "compression_loss": 76.24596405029297, + "distillation_loss": 2.970107316970825, + "epoch": 3.02, + "learning_rate": 3.876678876678877e-05, + "loss": 79.2099, + "step": 3575, + "task_loss": 2.3123908042907715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7133566961332795, + "compression/movement_sparsity/importance_threshold": -0.0020075736880939676, + "compression/movement_sparsity/linear_layer_sparsity": 0.7019223403764145, + "compression/movement_sparsity/model_sparsity": 0.6778091486187928, + "compression_loss": 76.28477478027344, + "distillation_loss": 4.013206481933594, + "epoch": 3.02, + "learning_rate": 3.8762092608246456e-05, + "loss": 79.3435, + "step": 3576, + "task_loss": 3.422053098678589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7137237169643091, + "compression/movement_sparsity/importance_threshold": -0.0020050031715201696, + "compression/movement_sparsity/linear_layer_sparsity": 0.7024589875408666, + "compression/movement_sparsity/model_sparsity": 0.6783273603022262, + "compression_loss": 76.32354736328125, + "distillation_loss": 2.5563859939575195, + "epoch": 3.02, + "learning_rate": 3.875739644970414e-05, + "loss": 79.1602, + "step": 3577, + "task_loss": 0.7791175842285156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7140904243702249, + "compression/movement_sparsity/importance_threshold": -0.0020024348500927557, + "compression/movement_sparsity/linear_layer_sparsity": 0.7028259418756926, + "compression/movement_sparsity/model_sparsity": 0.6786817086267674, + "compression_loss": 76.36231994628906, + "distillation_loss": 3.432677745819092, + "epoch": 3.02, + "learning_rate": 3.875270029116183e-05, + "loss": 79.0639, + "step": 3578, + "task_loss": 1.0367989540100098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7144568184849114, + "compression/movement_sparsity/importance_threshold": -0.0019998687228740325, + "compression/movement_sparsity/linear_layer_sparsity": 0.7033268284614046, + "compression/movement_sparsity/model_sparsity": 0.6791653882173528, + "compression_loss": 76.40101623535156, + "distillation_loss": 3.6220920085906982, + "epoch": 3.03, + "learning_rate": 3.874800413261952e-05, + "loss": 79.4855, + "step": 3579, + "task_loss": 3.0541529655456543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7148228994422541, + "compression/movement_sparsity/importance_threshold": -0.001997304788926302, + "compression/movement_sparsity/linear_layer_sparsity": 0.7038418451857652, + "compression/movement_sparsity/model_sparsity": 0.6796627125328548, + "compression_loss": 76.4397201538086, + "distillation_loss": 3.091390609741211, + "epoch": 3.03, + "learning_rate": 3.874330797407721e-05, + "loss": 79.6463, + "step": 3580, + "task_loss": 1.9463034868240356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7151886673761378, + "compression/movement_sparsity/importance_threshold": -0.0019947430473118708, + "compression/movement_sparsity/linear_layer_sparsity": 0.7041304577392236, + "compression/movement_sparsity/model_sparsity": 0.6799414103572259, + "compression_loss": 76.47840881347656, + "distillation_loss": 3.544201374053955, + "epoch": 3.03, + "learning_rate": 3.8738611815534894e-05, + "loss": 79.5556, + "step": 3581, + "task_loss": 2.1705336570739746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7155541224204482, + "compression/movement_sparsity/importance_threshold": -0.0019921834970930384, + "compression/movement_sparsity/linear_layer_sparsity": 0.7045039465179141, + "compression/movement_sparsity/model_sparsity": 0.6803020686473825, + "compression_loss": 76.5170669555664, + "distillation_loss": 3.277132511138916, + "epoch": 3.03, + "learning_rate": 3.873391565699258e-05, + "loss": 79.3831, + "step": 3582, + "task_loss": 1.6507868766784668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.71591926470907, + "compression/movement_sparsity/importance_threshold": -0.001989626137332113, + "compression/movement_sparsity/linear_layer_sparsity": 0.7049047535646583, + "compression/movement_sparsity/model_sparsity": 0.6806891067390445, + "compression_loss": 76.55569458007812, + "distillation_loss": 4.039057731628418, + "epoch": 3.03, + "learning_rate": 3.872921949845027e-05, + "loss": 80.0292, + "step": 3583, + "task_loss": 2.9238855838775635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7162840943758886, + "compression/movement_sparsity/importance_threshold": -0.001987070967091396, + "compression/movement_sparsity/linear_layer_sparsity": 0.7052450573848182, + "compression/movement_sparsity/model_sparsity": 0.6810177200760849, + "compression_loss": 76.59428405761719, + "distillation_loss": 5.368074893951416, + "epoch": 3.03, + "learning_rate": 3.872452333990796e-05, + "loss": 79.6794, + "step": 3584, + "task_loss": 2.2665061950683594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7166486115547885, + "compression/movement_sparsity/importance_threshold": -0.001984517985433195, + "compression/movement_sparsity/linear_layer_sparsity": 0.7057160977789377, + "compression/movement_sparsity/model_sparsity": 0.6814725787835766, + "compression_loss": 76.63282775878906, + "distillation_loss": 3.9251279830932617, + "epoch": 3.03, + "learning_rate": 3.871982718136564e-05, + "loss": 80.2899, + "step": 3585, + "task_loss": 2.7647080421447754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7170128163796559, + "compression/movement_sparsity/importance_threshold": -0.001981967191419809, + "compression/movement_sparsity/linear_layer_sparsity": 0.7061616919993223, + "compression/movement_sparsity/model_sparsity": 0.6819028654716828, + "compression_loss": 76.67135620117188, + "distillation_loss": 4.1690239906311035, + "epoch": 3.03, + "learning_rate": 3.871513102282333e-05, + "loss": 79.3071, + "step": 3586, + "task_loss": 2.2057766914367676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.717376708984375, + "compression/movement_sparsity/importance_threshold": -0.001979418584113546, + "compression/movement_sparsity/linear_layer_sparsity": 0.7067076877112008, + "compression/movement_sparsity/model_sparsity": 0.682430104551179, + "compression_loss": 76.70980072021484, + "distillation_loss": 1.8608615398406982, + "epoch": 3.03, + "learning_rate": 3.871043486428102e-05, + "loss": 79.5247, + "step": 3587, + "task_loss": 2.71101975440979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7177402895028313, + "compression/movement_sparsity/importance_threshold": -0.0019768721625767083, + "compression/movement_sparsity/linear_layer_sparsity": 0.7071375062578031, + "compression/movement_sparsity/model_sparsity": 0.682845157508429, + "compression_loss": 76.74826049804688, + "distillation_loss": 1.9667669534683228, + "epoch": 3.03, + "learning_rate": 3.870573870573871e-05, + "loss": 79.6373, + "step": 3588, + "task_loss": 1.2972990274429321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7181035580689104, + "compression/movement_sparsity/importance_threshold": -0.001974327925871598, + "compression/movement_sparsity/linear_layer_sparsity": 0.7073807592775748, + "compression/movement_sparsity/model_sparsity": 0.6830800540386377, + "compression_loss": 76.7866439819336, + "distillation_loss": 2.801029920578003, + "epoch": 3.03, + "learning_rate": 3.87010425471964e-05, + "loss": 79.4876, + "step": 3589, + "task_loss": 1.4843177795410156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7184665148164965, + "compression/movement_sparsity/importance_threshold": -0.0019717858730605234, + "compression/movement_sparsity/linear_layer_sparsity": 0.7078120564209639, + "compression/movement_sparsity/model_sparsity": 0.6834965347983262, + "compression_loss": 76.82508850097656, + "distillation_loss": 2.6059072017669678, + "epoch": 3.03, + "learning_rate": 3.869634638865408e-05, + "loss": 79.6378, + "step": 3590, + "task_loss": 1.5625333786010742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7188291598794756, + "compression/movement_sparsity/importance_threshold": -0.0019692460032057837, + "compression/movement_sparsity/linear_layer_sparsity": 0.7081871430381177, + "compression/movement_sparsity/model_sparsity": 0.6838587360362792, + "compression_loss": 76.86338806152344, + "distillation_loss": 3.4056460857391357, + "epoch": 3.04, + "learning_rate": 3.869165023011177e-05, + "loss": 80.0982, + "step": 3591, + "task_loss": 2.65210223197937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7191914933917322, + "compression/movement_sparsity/importance_threshold": -0.001966708315369688, + "compression/movement_sparsity/linear_layer_sparsity": 0.7086996556872747, + "compression/movement_sparsity/model_sparsity": 0.6843536422992644, + "compression_loss": 76.90164184570312, + "distillation_loss": 3.5913431644439697, + "epoch": 3.04, + "learning_rate": 3.868695407156946e-05, + "loss": 80.0666, + "step": 3592, + "task_loss": 2.770752191543579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.719553515487152, + "compression/movement_sparsity/importance_threshold": -0.001964172808614535, + "compression/movement_sparsity/linear_layer_sparsity": 0.7092405955520756, + "compression/movement_sparsity/model_sparsity": 0.6848759992155837, + "compression_loss": 76.93987274169922, + "distillation_loss": 2.959197521209717, + "epoch": 3.04, + "learning_rate": 3.868225791302715e-05, + "loss": 79.9809, + "step": 3593, + "task_loss": 1.1539011001586914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7199152262996196, + "compression/movement_sparsity/importance_threshold": -0.0019616394820026325, + "compression/movement_sparsity/linear_layer_sparsity": 0.7096841268914592, + "compression/movement_sparsity/model_sparsity": 0.6853042938889975, + "compression_loss": 76.97811889648438, + "distillation_loss": 4.976275444030762, + "epoch": 3.04, + "learning_rate": 3.867756175448483e-05, + "loss": 80.3089, + "step": 3594, + "task_loss": 3.1354384422302246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7202766259630206, + "compression/movement_sparsity/importance_threshold": -0.0019591083345962823, + "compression/movement_sparsity/linear_layer_sparsity": 0.7101547141672085, + "compression/movement_sparsity/model_sparsity": 0.6857587150441291, + "compression_loss": 77.0163345336914, + "distillation_loss": 3.236546754837036, + "epoch": 3.04, + "learning_rate": 3.867286559594252e-05, + "loss": 80.0372, + "step": 3595, + "task_loss": 1.638723373413086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7206377146112397, + "compression/movement_sparsity/importance_threshold": -0.0019565793654577916, + "compression/movement_sparsity/linear_layer_sparsity": 0.7106480289064718, + "compression/movement_sparsity/model_sparsity": 0.686235082904485, + "compression_loss": 77.05443572998047, + "distillation_loss": 3.1052932739257812, + "epoch": 3.04, + "learning_rate": 3.866816943740021e-05, + "loss": 80.1409, + "step": 3596, + "task_loss": 1.676505208015442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7209984923781626, + "compression/movement_sparsity/importance_threshold": -0.001954052573649459, + "compression/movement_sparsity/linear_layer_sparsity": 0.7110364109705395, + "compression/movement_sparsity/model_sparsity": 0.686610122849849, + "compression_loss": 77.0925064086914, + "distillation_loss": 2.44527268409729, + "epoch": 3.04, + "learning_rate": 3.8663473278857896e-05, + "loss": 79.7299, + "step": 3597, + "task_loss": 3.414745330810547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7213589593976738, + "compression/movement_sparsity/importance_threshold": -0.0019515279582335938, + "compression/movement_sparsity/linear_layer_sparsity": 0.7113079004192728, + "compression/movement_sparsity/model_sparsity": 0.686872285800819, + "compression_loss": 77.13054656982422, + "distillation_loss": 2.2238564491271973, + "epoch": 3.04, + "learning_rate": 3.865877712031558e-05, + "loss": 80.0305, + "step": 3598, + "task_loss": 2.557774782180786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7217191158036588, + "compression/movement_sparsity/importance_threshold": -0.0019490055182724989, + "compression/movement_sparsity/linear_layer_sparsity": 0.7118512608901039, + "compression/movement_sparsity/model_sparsity": 0.6873969801679047, + "compression_loss": 77.16860961914062, + "distillation_loss": 3.12207293510437, + "epoch": 3.04, + "learning_rate": 3.865408096177327e-05, + "loss": 80.1855, + "step": 3599, + "task_loss": 1.5262266397476196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.722078961730003, + "compression/movement_sparsity/importance_threshold": -0.0019464852528284738, + "compression/movement_sparsity/linear_layer_sparsity": 0.7122738295427835, + "compression/movement_sparsity/model_sparsity": 0.6878050322873916, + "compression_loss": 77.20663452148438, + "distillation_loss": 3.6043689250946045, + "epoch": 3.04, + "learning_rate": 3.864938480323096e-05, + "loss": 80.1713, + "step": 3600, + "task_loss": 2.2894561290740967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7224384973105908, + "compression/movement_sparsity/importance_threshold": -0.0019439671609638286, + "compression/movement_sparsity/linear_layer_sparsity": 0.7125002694861886, + "compression/movement_sparsity/model_sparsity": 0.68802369332213, + "compression_loss": 77.24468231201172, + "distillation_loss": 3.0752198696136475, + "epoch": 3.04, + "learning_rate": 3.864468864468865e-05, + "loss": 80.2729, + "step": 3601, + "task_loss": 2.9824411869049072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7227977226793083, + "compression/movement_sparsity/importance_threshold": -0.001941451241740862, + "compression/movement_sparsity/linear_layer_sparsity": 0.71290038493121, + "compression/movement_sparsity/model_sparsity": 0.6884100635707158, + "compression_loss": 77.28262329101562, + "distillation_loss": 3.17338228225708, + "epoch": 3.04, + "learning_rate": 3.8639992486146335e-05, + "loss": 80.1684, + "step": 3602, + "task_loss": 2.5938024520874023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7231566379700398, + "compression/movement_sparsity/importance_threshold": -0.0019389374942218813, + "compression/movement_sparsity/linear_layer_sparsity": 0.7132567029085048, + "compression/movement_sparsity/model_sparsity": 0.6887541409293283, + "compression_loss": 77.320556640625, + "distillation_loss": 4.08018684387207, + "epoch": 3.05, + "learning_rate": 3.863529632760402e-05, + "loss": 80.3788, + "step": 3603, + "task_loss": 2.2018134593963623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7235152433166709, + "compression/movement_sparsity/importance_threshold": -0.0019364259174691895, + "compression/movement_sparsity/linear_layer_sparsity": 0.7137803288818985, + "compression/movement_sparsity/model_sparsity": 0.6892597787396739, + "compression_loss": 77.3584213256836, + "distillation_loss": 3.6879143714904785, + "epoch": 3.05, + "learning_rate": 3.863060016906171e-05, + "loss": 80.3016, + "step": 3604, + "task_loss": 2.270658016204834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7238735388530865, + "compression/movement_sparsity/importance_threshold": -0.0019339165105450922, + "compression/movement_sparsity/linear_layer_sparsity": 0.7141935251388164, + "compression/movement_sparsity/model_sparsity": 0.6896587804340264, + "compression_loss": 77.39633178710938, + "distillation_loss": 3.4445180892944336, + "epoch": 3.05, + "learning_rate": 3.86259040105194e-05, + "loss": 80.0609, + "step": 3605, + "task_loss": 1.3628276586532593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7242315247131721, + "compression/movement_sparsity/importance_threshold": -0.0019314092725118892, + "compression/movement_sparsity/linear_layer_sparsity": 0.7147066339963553, + "compression/movement_sparsity/model_sparsity": 0.6901542624238013, + "compression_loss": 77.43413543701172, + "distillation_loss": 2.320568561553955, + "epoch": 3.05, + "learning_rate": 3.862120785197709e-05, + "loss": 80.0468, + "step": 3606, + "task_loss": 1.0154718160629272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7245892010308121, + "compression/movement_sparsity/importance_threshold": -0.0019289042024318902, + "compression/movement_sparsity/linear_layer_sparsity": 0.714971755939571, + "compression/movement_sparsity/model_sparsity": 0.6904102766126571, + "compression_loss": 77.47197723388672, + "distillation_loss": 4.382578372955322, + "epoch": 3.05, + "learning_rate": 3.861651169343477e-05, + "loss": 80.7576, + "step": 3607, + "task_loss": 1.8668166399002075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7249465679398923, + "compression/movement_sparsity/importance_threshold": -0.0019264012993673958, + "compression/movement_sparsity/linear_layer_sparsity": 0.7154117338769991, + "compression/movement_sparsity/model_sparsity": 0.690835139954404, + "compression_loss": 77.50968170166016, + "distillation_loss": 4.620975971221924, + "epoch": 3.05, + "learning_rate": 3.861181553489246e-05, + "loss": 81.0878, + "step": 3608, + "task_loss": 2.8663976192474365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.725303625574298, + "compression/movement_sparsity/importance_threshold": -0.001923900562380709, + "compression/movement_sparsity/linear_layer_sparsity": 0.7157900280952468, + "compression/movement_sparsity/model_sparsity": 0.6912004386024858, + "compression_loss": 77.54743957519531, + "distillation_loss": 3.309844970703125, + "epoch": 3.05, + "learning_rate": 3.8607119376350146e-05, + "loss": 80.7841, + "step": 3609, + "task_loss": 1.6952966451644897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7256603740679137, + "compression/movement_sparsity/importance_threshold": -0.0019214019905341353, + "compression/movement_sparsity/linear_layer_sparsity": 0.7162374228649444, + "compression/movement_sparsity/model_sparsity": 0.691632463985497, + "compression_loss": 77.58517456054688, + "distillation_loss": 4.154474258422852, + "epoch": 3.05, + "learning_rate": 3.860242321780784e-05, + "loss": 80.8587, + "step": 3610, + "task_loss": 1.9240983724594116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7260168135546251, + "compression/movement_sparsity/importance_threshold": -0.0019189055828899779, + "compression/movement_sparsity/linear_layer_sparsity": 0.716595779874905, + "compression/movement_sparsity/model_sparsity": 0.6919785103297303, + "compression_loss": 77.62287139892578, + "distillation_loss": 2.423325777053833, + "epoch": 3.05, + "learning_rate": 3.859772705926552e-05, + "loss": 80.1838, + "step": 3611, + "task_loss": 1.3057798147201538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7263729441683169, + "compression/movement_sparsity/importance_threshold": -0.0019164113385105423, + "compression/movement_sparsity/linear_layer_sparsity": 0.7169204153387914, + "compression/movement_sparsity/model_sparsity": 0.6922919935667368, + "compression_loss": 77.66051483154297, + "distillation_loss": 4.885219573974609, + "epoch": 3.05, + "learning_rate": 3.859303090072321e-05, + "loss": 81.002, + "step": 3612, + "task_loss": 2.5745415687561035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7267287660428745, + "compression/movement_sparsity/importance_threshold": -0.0019139192564581308, + "compression/movement_sparsity/linear_layer_sparsity": 0.7173174185760598, + "compression/movement_sparsity/model_sparsity": 0.6926753585214802, + "compression_loss": 77.69811248779297, + "distillation_loss": 2.904564380645752, + "epoch": 3.05, + "learning_rate": 3.85883347421809e-05, + "loss": 81.0122, + "step": 3613, + "task_loss": 1.4313076734542847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7270842793121828, + "compression/movement_sparsity/importance_threshold": -0.0019114293357950498, + "compression/movement_sparsity/linear_layer_sparsity": 0.7177054071425955, + "compression/movement_sparsity/model_sparsity": 0.693050018487163, + "compression_loss": 77.7357406616211, + "distillation_loss": 2.5375521183013916, + "epoch": 3.05, + "learning_rate": 3.8583638583638584e-05, + "loss": 80.1462, + "step": 3614, + "task_loss": 1.7872376441955566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7274394841101275, + "compression/movement_sparsity/importance_threshold": -0.001908941575583599, + "compression/movement_sparsity/linear_layer_sparsity": 0.7180883517862212, + "compression/movement_sparsity/model_sparsity": 0.6934198078042046, + "compression_loss": 77.77330780029297, + "distillation_loss": 1.942220687866211, + "epoch": 3.06, + "learning_rate": 3.857894242509627e-05, + "loss": 80.648, + "step": 3615, + "task_loss": 1.4977554082870483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7277943805705929, + "compression/movement_sparsity/importance_threshold": -0.0019064559748860873, + "compression/movement_sparsity/linear_layer_sparsity": 0.7183394747566325, + "compression/movement_sparsity/model_sparsity": 0.6936623039280376, + "compression_loss": 77.81082153320312, + "distillation_loss": 2.5870237350463867, + "epoch": 3.06, + "learning_rate": 3.857424626655396e-05, + "loss": 80.8943, + "step": 3616, + "task_loss": 1.488090991973877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7281489688274649, + "compression/movement_sparsity/importance_threshold": -0.0019039725327648143, + "compression/movement_sparsity/linear_layer_sparsity": 0.718707299555696, + "compression/movement_sparsity/model_sparsity": 0.6940174928136918, + "compression_loss": 77.84829711914062, + "distillation_loss": 3.1270554065704346, + "epoch": 3.06, + "learning_rate": 3.856955010801165e-05, + "loss": 80.6975, + "step": 3617, + "task_loss": 2.2518465518951416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7285032490146282, + "compression/movement_sparsity/importance_threshold": -0.0019014912482820884, + "compression/movement_sparsity/linear_layer_sparsity": 0.7192190967547949, + "compression/movement_sparsity/model_sparsity": 0.6945117082045293, + "compression_loss": 77.8857650756836, + "distillation_loss": 2.673177719116211, + "epoch": 3.06, + "learning_rate": 3.8564853949469336e-05, + "loss": 80.9399, + "step": 3618, + "task_loss": 1.6172763109207153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7288572212659681, + "compression/movement_sparsity/importance_threshold": -0.00189901212050021, + "compression/movement_sparsity/linear_layer_sparsity": 0.7197790914394779, + "compression/movement_sparsity/model_sparsity": 0.6950524653490484, + "compression_loss": 77.92314910888672, + "distillation_loss": 2.8687996864318848, + "epoch": 3.06, + "learning_rate": 3.856015779092703e-05, + "loss": 81.7723, + "step": 3619, + "task_loss": 1.2039071321487427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7292108857153698, + "compression/movement_sparsity/importance_threshold": -0.0018965351484814827, + "compression/movement_sparsity/linear_layer_sparsity": 0.7201681651052685, + "compression/movement_sparsity/model_sparsity": 0.6954281731374884, + "compression_loss": 77.96052551269531, + "distillation_loss": 3.458604335784912, + "epoch": 3.06, + "learning_rate": 3.855546163238471e-05, + "loss": 80.8898, + "step": 3620, + "task_loss": 1.9407539367675781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7295642424967181, + "compression/movement_sparsity/importance_threshold": -0.001894060331288215, + "compression/movement_sparsity/linear_layer_sparsity": 0.7204810314156982, + "compression/movement_sparsity/model_sparsity": 0.6957302915276656, + "compression_loss": 77.99787902832031, + "distillation_loss": 4.6278605461120605, + "epoch": 3.06, + "learning_rate": 3.8550765473842395e-05, + "loss": 82.0177, + "step": 3621, + "task_loss": 2.6036248207092285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7299172917438987, + "compression/movement_sparsity/importance_threshold": -0.0018915876679827066, + "compression/movement_sparsity/linear_layer_sparsity": 0.7208555695211407, + "compression/movement_sparsity/model_sparsity": 0.6960919630969721, + "compression_loss": 78.03522491455078, + "distillation_loss": 4.99056339263916, + "epoch": 3.06, + "learning_rate": 3.854606931530009e-05, + "loss": 81.5843, + "step": 3622, + "task_loss": 2.6957998275756836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7302700335907963, + "compression/movement_sparsity/importance_threshold": -0.0018891171576272629, + "compression/movement_sparsity/linear_layer_sparsity": 0.7212296068115425, + "compression/movement_sparsity/model_sparsity": 0.6964531510557753, + "compression_loss": 78.072509765625, + "distillation_loss": 3.8841280937194824, + "epoch": 3.06, + "learning_rate": 3.8541373156757775e-05, + "loss": 81.0477, + "step": 3623, + "task_loss": 1.9544757604599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7306224681712964, + "compression/movement_sparsity/importance_threshold": -0.001886648799284187, + "compression/movement_sparsity/linear_layer_sparsity": 0.7216126945451798, + "compression/movement_sparsity/model_sparsity": 0.6968230785472465, + "compression_loss": 78.10975646972656, + "distillation_loss": 3.145758628845215, + "epoch": 3.06, + "learning_rate": 3.853667699821546e-05, + "loss": 81.7043, + "step": 3624, + "task_loss": 2.4154679775238037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7309745956192836, + "compression/movement_sparsity/importance_threshold": -0.0018841825920157852, + "compression/movement_sparsity/linear_layer_sparsity": 0.7219546081279705, + "compression/movement_sparsity/model_sparsity": 0.6971532463466191, + "compression_loss": 78.14703369140625, + "distillation_loss": 2.746878147125244, + "epoch": 3.06, + "learning_rate": 3.853198083967315e-05, + "loss": 81.6506, + "step": 3625, + "task_loss": 1.4255664348602295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7313264160686437, + "compression/movement_sparsity/importance_threshold": -0.0018817185348843574, + "compression/movement_sparsity/linear_layer_sparsity": 0.7223441349121312, + "compression/movement_sparsity/model_sparsity": 0.6975293916874195, + "compression_loss": 78.18419647216797, + "distillation_loss": 4.186704158782959, + "epoch": 3.07, + "learning_rate": 3.852728468113084e-05, + "loss": 81.5808, + "step": 3626, + "task_loss": 2.326265335083008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7316779296532612, + "compression/movement_sparsity/importance_threshold": -0.0018792566269522125, + "compression/movement_sparsity/linear_layer_sparsity": 0.7226935964930356, + "compression/movement_sparsity/model_sparsity": 0.69786684818795, + "compression_loss": 78.22138214111328, + "distillation_loss": 3.167693853378296, + "epoch": 3.07, + "learning_rate": 3.852258852258853e-05, + "loss": 81.2427, + "step": 3627, + "task_loss": 1.7537399530410767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7320291365070214, + "compression/movement_sparsity/importance_threshold": -0.0018767968672816528, + "compression/movement_sparsity/linear_layer_sparsity": 0.7231088794792897, + "compression/movement_sparsity/model_sparsity": 0.6982678649260665, + "compression_loss": 78.25849151611328, + "distillation_loss": 3.4511637687683105, + "epoch": 3.07, + "learning_rate": 3.8517892364046206e-05, + "loss": 81.4805, + "step": 3628, + "task_loss": 2.6254754066467285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.73238003676381, + "compression/movement_sparsity/importance_threshold": -0.0018743392549349794, + "compression/movement_sparsity/linear_layer_sparsity": 0.7234880680101101, + "compression/movement_sparsity/model_sparsity": 0.6986340271643329, + "compression_loss": 78.29554748535156, + "distillation_loss": 4.300675392150879, + "epoch": 3.07, + "learning_rate": 3.85131962055039e-05, + "loss": 81.9237, + "step": 3629, + "task_loss": 2.7455594539642334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7327306305575114, + "compression/movement_sparsity/importance_threshold": -0.0018718837889744998, + "compression/movement_sparsity/linear_layer_sparsity": 0.7239467668907266, + "compression/movement_sparsity/model_sparsity": 0.6990769683272773, + "compression_loss": 78.33262634277344, + "distillation_loss": 2.9245405197143555, + "epoch": 3.07, + "learning_rate": 3.8508500046961586e-05, + "loss": 80.9703, + "step": 3630, + "task_loss": 0.8655033707618713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7330809180220114, + "compression/movement_sparsity/importance_threshold": -0.0018694304684625154, + "compression/movement_sparsity/linear_layer_sparsity": 0.7243793518442204, + "compression/movement_sparsity/model_sparsity": 0.6994946926568316, + "compression_loss": 78.36958312988281, + "distillation_loss": 2.9687905311584473, + "epoch": 3.07, + "learning_rate": 3.850380388841928e-05, + "loss": 81.3497, + "step": 3631, + "task_loss": 2.8129398822784424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7334308992911944, + "compression/movement_sparsity/importance_threshold": -0.0018669792924613333, + "compression/movement_sparsity/linear_layer_sparsity": 0.724774876484702, + "compression/movement_sparsity/model_sparsity": 0.6998766298091366, + "compression_loss": 78.4065170288086, + "distillation_loss": 4.348560810089111, + "epoch": 3.07, + "learning_rate": 3.849910772987696e-05, + "loss": 82.2525, + "step": 3632, + "task_loss": 2.106330633163452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7337805744989463, + "compression/movement_sparsity/importance_threshold": -0.0018645302600332541, + "compression/movement_sparsity/linear_layer_sparsity": 0.7250536635240284, + "compression/movement_sparsity/model_sparsity": 0.700145839656013, + "compression_loss": 78.44348907470703, + "distillation_loss": 4.152263641357422, + "epoch": 3.07, + "learning_rate": 3.849441157133465e-05, + "loss": 82.0938, + "step": 3633, + "task_loss": 1.9087483882904053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7341299437791516, + "compression/movement_sparsity/importance_threshold": -0.0018620833702405843, + "compression/movement_sparsity/linear_layer_sparsity": 0.7254442634832764, + "compression/movement_sparsity/model_sparsity": 0.7005230213050347, + "compression_loss": 78.48028564453125, + "distillation_loss": 3.975745439529419, + "epoch": 3.07, + "learning_rate": 3.848971541279234e-05, + "loss": 81.7384, + "step": 3634, + "task_loss": 2.539940595626831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.734479007265696, + "compression/movement_sparsity/importance_threshold": -0.0018596386221456252, + "compression/movement_sparsity/linear_layer_sparsity": 0.7257643200667906, + "compression/movement_sparsity/model_sparsity": 0.700832082960296, + "compression_loss": 78.51718139648438, + "distillation_loss": 4.288151741027832, + "epoch": 3.07, + "learning_rate": 3.8485019254250024e-05, + "loss": 81.735, + "step": 3635, + "task_loss": 2.3239364624023438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7348277650924642, + "compression/movement_sparsity/importance_threshold": -0.001857196014810685, + "compression/movement_sparsity/linear_layer_sparsity": 0.726067635118944, + "compression/movement_sparsity/model_sparsity": 0.7011249782073017, + "compression_loss": 78.55399322509766, + "distillation_loss": 2.8438634872436523, + "epoch": 3.07, + "learning_rate": 3.848032309570772e-05, + "loss": 81.3079, + "step": 3636, + "task_loss": 1.6512243747711182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7351762173933416, + "compression/movement_sparsity/importance_threshold": -0.0018547555472980632, + "compression/movement_sparsity/linear_layer_sparsity": 0.7263469825941494, + "compression/movement_sparsity/model_sparsity": 0.7013947292373603, + "compression_loss": 78.59081268310547, + "distillation_loss": 2.2527084350585938, + "epoch": 3.07, + "learning_rate": 3.84756269371654e-05, + "loss": 81.3794, + "step": 3637, + "task_loss": 1.198965311050415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7355243643022131, + "compression/movement_sparsity/importance_threshold": -0.0018523172186700665, + "compression/movement_sparsity/linear_layer_sparsity": 0.7266417241697727, + "compression/movement_sparsity/model_sparsity": 0.7016793455331298, + "compression_loss": 78.62754821777344, + "distillation_loss": 3.71903657913208, + "epoch": 3.08, + "learning_rate": 3.847093077862309e-05, + "loss": 81.5244, + "step": 3638, + "task_loss": 2.1009232997894287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7358722059529639, + "compression/movement_sparsity/importance_threshold": -0.0018498810279889994, + "compression/movement_sparsity/linear_layer_sparsity": 0.726897592958903, + "compression/movement_sparsity/model_sparsity": 0.7019264244422091, + "compression_loss": 78.66429901123047, + "distillation_loss": 3.2995386123657227, + "epoch": 3.08, + "learning_rate": 3.8466234620080777e-05, + "loss": 81.7971, + "step": 3639, + "task_loss": 1.203376054763794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7362197424794796, + "compression/movement_sparsity/importance_threshold": -0.0018474469743171608, + "compression/movement_sparsity/linear_layer_sparsity": 0.7272286078524746, + "compression/movement_sparsity/model_sparsity": 0.7022460679558655, + "compression_loss": 78.70101165771484, + "distillation_loss": 2.769207000732422, + "epoch": 3.08, + "learning_rate": 3.846153846153846e-05, + "loss": 82.1276, + "step": 3640, + "task_loss": 2.1956887245178223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7365669740156446, + "compression/movement_sparsity/importance_threshold": -0.0018450150567168623, + "compression/movement_sparsity/linear_layer_sparsity": 0.7275413906937309, + "compression/movement_sparsity/model_sparsity": 0.7025481057442922, + "compression_loss": 78.73765563964844, + "distillation_loss": 3.2514421939849854, + "epoch": 3.08, + "learning_rate": 3.845684230299615e-05, + "loss": 82.4288, + "step": 3641, + "task_loss": 2.4310896396636963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7369139006953448, + "compression/movement_sparsity/importance_threshold": -0.001842585274250401, + "compression/movement_sparsity/linear_layer_sparsity": 0.7279702791552576, + "compression/movement_sparsity/model_sparsity": 0.7029622605677502, + "compression_loss": 78.77429962158203, + "distillation_loss": 1.6661925315856934, + "epoch": 3.08, + "learning_rate": 3.8452146144453836e-05, + "loss": 82.3089, + "step": 3642, + "task_loss": 1.5886659622192383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7372605226524647, + "compression/movement_sparsity/importance_threshold": -0.0018401576259800857, + "compression/movement_sparsity/linear_layer_sparsity": 0.7283275510659634, + "compression/movement_sparsity/model_sparsity": 0.7033072590892262, + "compression_loss": 78.81092834472656, + "distillation_loss": 3.1796875, + "epoch": 3.08, + "learning_rate": 3.844744998591153e-05, + "loss": 82.006, + "step": 3643, + "task_loss": 2.089752435684204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7376068400208897, + "compression/movement_sparsity/importance_threshold": -0.001837732110968218, + "compression/movement_sparsity/linear_layer_sparsity": 0.7286816153755751, + "compression/movement_sparsity/model_sparsity": 0.7036491602005736, + "compression_loss": 78.84752655029297, + "distillation_loss": 2.6766669750213623, + "epoch": 3.08, + "learning_rate": 3.8442753827369215e-05, + "loss": 81.9107, + "step": 3644, + "task_loss": 1.5094844102859497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.737952852934505, + "compression/movement_sparsity/importance_threshold": -0.0018353087282771026, + "compression/movement_sparsity/linear_layer_sparsity": 0.7290775573619238, + "compression/movement_sparsity/model_sparsity": 0.7040315003616313, + "compression_loss": 78.88410186767578, + "distillation_loss": 3.699563980102539, + "epoch": 3.08, + "learning_rate": 3.84380576688269e-05, + "loss": 81.8898, + "step": 3645, + "task_loss": 1.863808035850525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7382985615271958, + "compression/movement_sparsity/importance_threshold": -0.0018328874769690424, + "compression/movement_sparsity/linear_layer_sparsity": 0.7294015727690931, + "compression/movement_sparsity/model_sparsity": 0.7043443848427763, + "compression_loss": 78.92060089111328, + "distillation_loss": 3.2535645961761475, + "epoch": 3.08, + "learning_rate": 3.843336151028459e-05, + "loss": 81.9191, + "step": 3646, + "task_loss": 1.5681560039520264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7386439659328468, + "compression/movement_sparsity/importance_threshold": -0.001830468356106345, + "compression/movement_sparsity/linear_layer_sparsity": 0.7298259896677565, + "compression/movement_sparsity/model_sparsity": 0.7047542217153114, + "compression_loss": 78.95710754394531, + "distillation_loss": 3.8319528102874756, + "epoch": 3.08, + "learning_rate": 3.8428665351742274e-05, + "loss": 81.7564, + "step": 3647, + "task_loss": 1.8651117086410522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7389890662853433, + "compression/movement_sparsity/importance_threshold": -0.0018280513647513114, + "compression/movement_sparsity/linear_layer_sparsity": 0.7302781183320055, + "compression/movement_sparsity/model_sparsity": 0.705190818369033, + "compression_loss": 78.99356079101562, + "distillation_loss": 3.7793290615081787, + "epoch": 3.08, + "learning_rate": 3.842396919319997e-05, + "loss": 82.434, + "step": 3648, + "task_loss": 1.4207701683044434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.739333862718571, + "compression/movement_sparsity/importance_threshold": -0.0018256365019662431, + "compression/movement_sparsity/linear_layer_sparsity": 0.7306252904727236, + "compression/movement_sparsity/model_sparsity": 0.7055260640786911, + "compression_loss": 79.02995300292969, + "distillation_loss": 2.865920066833496, + "epoch": 3.08, + "learning_rate": 3.8419273034657653e-05, + "loss": 82.3231, + "step": 3649, + "task_loss": 1.8904931545257568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7396783553664146, + "compression/movement_sparsity/importance_threshold": -0.0018232237668134484, + "compression/movement_sparsity/linear_layer_sparsity": 0.7310068280645683, + "compression/movement_sparsity/model_sparsity": 0.705894494680509, + "compression_loss": 79.06633758544922, + "distillation_loss": 3.8554601669311523, + "epoch": 3.09, + "learning_rate": 3.841457687611534e-05, + "loss": 82.5304, + "step": 3650, + "task_loss": 2.560987949371338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7400225443627593, + "compression/movement_sparsity/importance_threshold": -0.0018208131583552292, + "compression/movement_sparsity/linear_layer_sparsity": 0.7312806546501582, + "compression/movement_sparsity/model_sparsity": 0.7061589144804948, + "compression_loss": 79.10265350341797, + "distillation_loss": 1.7472658157348633, + "epoch": 3.09, + "learning_rate": 3.8409880717573026e-05, + "loss": 82.0822, + "step": 3651, + "task_loss": 1.467380404472351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.74036642984149, + "compression/movement_sparsity/importance_threshold": -0.0018184046756538905, + "compression/movement_sparsity/linear_layer_sparsity": 0.7316225443846137, + "compression/movement_sparsity/model_sparsity": 0.7064890592507959, + "compression_loss": 79.13899230957031, + "distillation_loss": 4.3007049560546875, + "epoch": 3.09, + "learning_rate": 3.840518455903071e-05, + "loss": 82.6115, + "step": 3652, + "task_loss": 2.3277735710144043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7407100119364922, + "compression/movement_sparsity/importance_threshold": -0.001815998317771736, + "compression/movement_sparsity/linear_layer_sparsity": 0.7320303151452573, + "compression/movement_sparsity/model_sparsity": 0.7068828218313619, + "compression_loss": 79.17524719238281, + "distillation_loss": 3.8125357627868652, + "epoch": 3.09, + "learning_rate": 3.8400488400488406e-05, + "loss": 82.2469, + "step": 3653, + "task_loss": 2.9564433097839355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7410532907816508, + "compression/movement_sparsity/importance_threshold": -0.00181359408377107, + "compression/movement_sparsity/linear_layer_sparsity": 0.7323878016909805, + "compression/movement_sparsity/model_sparsity": 0.7072280276144821, + "compression_loss": 79.21153259277344, + "distillation_loss": 3.332451343536377, + "epoch": 3.09, + "learning_rate": 3.8395792241946085e-05, + "loss": 82.7035, + "step": 3654, + "task_loss": 2.9230785369873047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7413962665108513, + "compression/movement_sparsity/importance_threshold": -0.0018111919727141932, + "compression/movement_sparsity/linear_layer_sparsity": 0.732878588506705, + "compression/movement_sparsity/model_sparsity": 0.7077019543932497, + "compression_loss": 79.24777221679688, + "distillation_loss": 2.7436177730560303, + "epoch": 3.09, + "learning_rate": 3.839109608340378e-05, + "loss": 82.1883, + "step": 3655, + "task_loss": 1.7471091747283936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7417389392579783, + "compression/movement_sparsity/importance_threshold": -0.0018087919836634152, + "compression/movement_sparsity/linear_layer_sparsity": 0.7331544541249607, + "compression/movement_sparsity/model_sparsity": 0.7079683431788564, + "compression_loss": 79.28392791748047, + "distillation_loss": 3.8545992374420166, + "epoch": 3.09, + "learning_rate": 3.8386399924861465e-05, + "loss": 82.9038, + "step": 3656, + "task_loss": 2.3255913257598877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7420813091569174, + "compression/movement_sparsity/importance_threshold": -0.0018063941156810362, + "compression/movement_sparsity/linear_layer_sparsity": 0.7335390562278878, + "compression/movement_sparsity/model_sparsity": 0.7083397330163734, + "compression_loss": 79.32008361816406, + "distillation_loss": 3.248971939086914, + "epoch": 3.09, + "learning_rate": 3.838170376631916e-05, + "loss": 83.0756, + "step": 3657, + "task_loss": 1.86123788356781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7424233763415533, + "compression/movement_sparsity/importance_threshold": -0.0018039983678293618, + "compression/movement_sparsity/linear_layer_sparsity": 0.7339600508904396, + "compression/movement_sparsity/model_sparsity": 0.7087462652171356, + "compression_loss": 79.35621643066406, + "distillation_loss": 4.9636030197143555, + "epoch": 3.09, + "learning_rate": 3.837700760777684e-05, + "loss": 83.3769, + "step": 3658, + "task_loss": 3.1896374225616455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7427651409457716, + "compression/movement_sparsity/importance_threshold": -0.0018016047391706943, + "compression/movement_sparsity/linear_layer_sparsity": 0.7343515570864279, + "compression/movement_sparsity/model_sparsity": 0.7091243219708777, + "compression_loss": 79.39234924316406, + "distillation_loss": 3.953709125518799, + "epoch": 3.09, + "learning_rate": 3.837231144923453e-05, + "loss": 83.1929, + "step": 3659, + "task_loss": 2.3795108795166016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7431066031034573, + "compression/movement_sparsity/importance_threshold": -0.0017992132287673384, + "compression/movement_sparsity/linear_layer_sparsity": 0.7347732910473731, + "compression/movement_sparsity/model_sparsity": 0.709531568072859, + "compression_loss": 79.42837524414062, + "distillation_loss": 5.477741241455078, + "epoch": 3.09, + "learning_rate": 3.836761529069222e-05, + "loss": 83.1869, + "step": 3660, + "task_loss": 2.803633451461792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7434477629484952, + "compression/movement_sparsity/importance_threshold": -0.0017968238356815998, + "compression/movement_sparsity/linear_layer_sparsity": 0.7351081693712587, + "compression/movement_sparsity/model_sparsity": 0.7098549422961129, + "compression_loss": 79.46440124511719, + "distillation_loss": 4.0181097984313965, + "epoch": 3.09, + "learning_rate": 3.83629191321499e-05, + "loss": 82.752, + "step": 3661, + "task_loss": 2.931694984436035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7437886206147711, + "compression/movement_sparsity/importance_threshold": -0.0017944365589757788, + "compression/movement_sparsity/linear_layer_sparsity": 0.7354431430884854, + "compression/movement_sparsity/model_sparsity": 0.7101784086356532, + "compression_loss": 79.50037384033203, + "distillation_loss": 4.247005462646484, + "epoch": 3.1, + "learning_rate": 3.835822297360759e-05, + "loss": 82.8285, + "step": 3662, + "task_loss": 2.5462474822998047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7441291762361695, + "compression/movement_sparsity/importance_threshold": -0.0017920513977121837, + "compression/movement_sparsity/linear_layer_sparsity": 0.7359013530782287, + "compression/movement_sparsity/model_sparsity": 0.7106208777026299, + "compression_loss": 79.536376953125, + "distillation_loss": 4.422715663909912, + "epoch": 3.1, + "learning_rate": 3.8353526815065276e-05, + "loss": 83.3995, + "step": 3663, + "task_loss": 3.2990036010742188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7444694299465761, + "compression/movement_sparsity/importance_threshold": -0.0017896683509531141, + "compression/movement_sparsity/linear_layer_sparsity": 0.7363274393603609, + "compression/movement_sparsity/model_sparsity": 0.7110323266101762, + "compression_loss": 79.5722427368164, + "distillation_loss": 3.418513298034668, + "epoch": 3.1, + "learning_rate": 3.834883065652297e-05, + "loss": 83.3651, + "step": 3664, + "task_loss": 3.4073374271392822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7448093818798753, + "compression/movement_sparsity/importance_threshold": -0.001787287417760879, + "compression/movement_sparsity/linear_layer_sparsity": 0.736693380140938, + "compression/movement_sparsity/model_sparsity": 0.7113856961991749, + "compression_loss": 79.60807037353516, + "distillation_loss": 2.824603796005249, + "epoch": 3.1, + "learning_rate": 3.8344134497980655e-05, + "loss": 82.7088, + "step": 3665, + "task_loss": 2.347618341445923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7451490321699531, + "compression/movement_sparsity/importance_threshold": -0.0017849085971977765, + "compression/movement_sparsity/linear_layer_sparsity": 0.7371311044106829, + "compression/movement_sparsity/model_sparsity": 0.7118083832936567, + "compression_loss": 79.64395904541016, + "distillation_loss": 4.204372406005859, + "epoch": 3.1, + "learning_rate": 3.833943833943834e-05, + "loss": 83.3589, + "step": 3666, + "task_loss": 1.3465356826782227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7454883809506939, + "compression/movement_sparsity/importance_threshold": -0.0017825318883261162, + "compression/movement_sparsity/linear_layer_sparsity": 0.737431188013407, + "compression/movement_sparsity/model_sparsity": 0.7120981581014622, + "compression_loss": 79.6798095703125, + "distillation_loss": 3.618772506713867, + "epoch": 3.1, + "learning_rate": 3.833474218089603e-05, + "loss": 83.5978, + "step": 3667, + "task_loss": 1.5472601652145386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7458274283559831, + "compression/movement_sparsity/importance_threshold": -0.0017801572902082004, + "compression/movement_sparsity/linear_layer_sparsity": 0.7377192401309866, + "compression/movement_sparsity/model_sparsity": 0.7123763147426508, + "compression_loss": 79.71559143066406, + "distillation_loss": 3.541869640350342, + "epoch": 3.1, + "learning_rate": 3.8330046022353714e-05, + "loss": 83.2007, + "step": 3668, + "task_loss": 1.3770487308502197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7461661745197063, + "compression/movement_sparsity/importance_threshold": -0.0017777848019063297, + "compression/movement_sparsity/linear_layer_sparsity": 0.7380524609955706, + "compression/movement_sparsity/model_sparsity": 0.7126980884454294, + "compression_loss": 79.75137329101562, + "distillation_loss": 2.7551233768463135, + "epoch": 3.1, + "learning_rate": 3.832534986381141e-05, + "loss": 82.8538, + "step": 3669, + "task_loss": 2.6314852237701416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7465046195757481, + "compression/movement_sparsity/importance_threshold": -0.0017754144224828112, + "compression/movement_sparsity/linear_layer_sparsity": 0.7383105119073784, + "compression/movement_sparsity/model_sparsity": 0.712947274514559, + "compression_loss": 79.78717041015625, + "distillation_loss": 2.686685562133789, + "epoch": 3.1, + "learning_rate": 3.8320653705269094e-05, + "loss": 83.3442, + "step": 3670, + "task_loss": 1.5284781455993652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7468427636579938, + "compression/movement_sparsity/importance_threshold": -0.0017730461509999481, + "compression/movement_sparsity/linear_layer_sparsity": 0.7386255364921502, + "compression/movement_sparsity/model_sparsity": 0.7132514770357149, + "compression_loss": 79.82286071777344, + "distillation_loss": 4.080593585968018, + "epoch": 3.1, + "learning_rate": 3.831595754672678e-05, + "loss": 83.3257, + "step": 3671, + "task_loss": 2.463806629180908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7471806069003284, + "compression/movement_sparsity/importance_threshold": -0.001770679986520046, + "compression/movement_sparsity/linear_layer_sparsity": 0.7389662934306803, + "compression/movement_sparsity/model_sparsity": 0.7135805279251155, + "compression_loss": 79.85858154296875, + "distillation_loss": 3.674704074859619, + "epoch": 3.1, + "learning_rate": 3.8311261388184466e-05, + "loss": 83.2212, + "step": 3672, + "task_loss": 2.4121415615081787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7475181494366374, + "compression/movement_sparsity/importance_threshold": -0.0017683159281054054, + "compression/movement_sparsity/linear_layer_sparsity": 0.7392809841387584, + "compression/movement_sparsity/model_sparsity": 0.7138844080392692, + "compression_loss": 79.89419555664062, + "distillation_loss": 3.373446226119995, + "epoch": 3.1, + "learning_rate": 3.830656522964215e-05, + "loss": 82.855, + "step": 3673, + "task_loss": 1.2321401834487915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7478553914008055, + "compression/movement_sparsity/importance_threshold": -0.0017659539748183344, + "compression/movement_sparsity/linear_layer_sparsity": 0.7395644812243011, + "compression/movement_sparsity/model_sparsity": 0.7141581661277845, + "compression_loss": 79.9298324584961, + "distillation_loss": 3.794870138168335, + "epoch": 3.11, + "learning_rate": 3.8301869071099846e-05, + "loss": 83.7096, + "step": 3674, + "task_loss": 1.777003288269043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7481923329267182, + "compression/movement_sparsity/importance_threshold": -0.0017635941257211334, + "compression/movement_sparsity/linear_layer_sparsity": 0.7398825583959877, + "compression/movement_sparsity/model_sparsity": 0.7144653163701038, + "compression_loss": 79.9654312133789, + "distillation_loss": 3.687852621078491, + "epoch": 3.11, + "learning_rate": 3.8297172912557525e-05, + "loss": 83.5923, + "step": 3675, + "task_loss": 2.527103900909424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7485289741482603, + "compression/movement_sparsity/importance_threshold": -0.00176123637987611, + "compression/movement_sparsity/linear_layer_sparsity": 0.7402259624997329, + "compression/movement_sparsity/model_sparsity": 0.7147969234864509, + "compression_loss": 80.00098419189453, + "distillation_loss": 4.270294189453125, + "epoch": 3.11, + "learning_rate": 3.829247675401522e-05, + "loss": 83.8908, + "step": 3676, + "task_loss": 2.380890369415283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7488653151993174, + "compression/movement_sparsity/importance_threshold": -0.0017588807363455626, + "compression/movement_sparsity/linear_layer_sparsity": 0.7404894508319826, + "compression/movement_sparsity/model_sparsity": 0.7150513601839028, + "compression_loss": 80.0364990234375, + "distillation_loss": 2.9639081954956055, + "epoch": 3.11, + "learning_rate": 3.8287780595472905e-05, + "loss": 83.9567, + "step": 3677, + "task_loss": 2.479766607284546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.749201356213774, + "compression/movement_sparsity/importance_threshold": -0.0017565271941918023, + "compression/movement_sparsity/linear_layer_sparsity": 0.7409115425179569, + "compression/movement_sparsity/model_sparsity": 0.715458951721958, + "compression_loss": 80.07195281982422, + "distillation_loss": 2.8811328411102295, + "epoch": 3.11, + "learning_rate": 3.828308443693059e-05, + "loss": 83.4629, + "step": 3678, + "task_loss": 1.3239631652832031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7495370973255158, + "compression/movement_sparsity/importance_threshold": -0.0017541757524771285, + "compression/movement_sparsity/linear_layer_sparsity": 0.7412424143215167, + "compression/movement_sparsity/model_sparsity": 0.7157784570611848, + "compression_loss": 80.10746765136719, + "distillation_loss": 2.1937625408172607, + "epoch": 3.11, + "learning_rate": 3.827838827838828e-05, + "loss": 83.4617, + "step": 3679, + "task_loss": 0.9094792604446411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7498725386684278, + "compression/movement_sparsity/importance_threshold": -0.0017518264102638443, + "compression/movement_sparsity/linear_layer_sparsity": 0.7415643668476851, + "compression/movement_sparsity/model_sparsity": 0.7160893495276375, + "compression_loss": 80.14289093017578, + "distillation_loss": 4.416505813598633, + "epoch": 3.11, + "learning_rate": 3.8273692119845964e-05, + "loss": 83.4784, + "step": 3680, + "task_loss": 2.572610378265381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7502076803763951, + "compression/movement_sparsity/importance_threshold": -0.001749479166614257, + "compression/movement_sparsity/linear_layer_sparsity": 0.7418542910595834, + "compression/movement_sparsity/model_sparsity": 0.7163693139509459, + "compression_loss": 80.17829132080078, + "distillation_loss": 3.7247090339660645, + "epoch": 3.11, + "learning_rate": 3.826899596130366e-05, + "loss": 83.4595, + "step": 3681, + "task_loss": 2.460824489593506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7505425225833029, + "compression/movement_sparsity/importance_threshold": -0.001747134020590668, + "compression/movement_sparsity/linear_layer_sparsity": 0.7421404949311794, + "compression/movement_sparsity/model_sparsity": 0.7166456858390865, + "compression_loss": 80.21363830566406, + "distillation_loss": 4.373508453369141, + "epoch": 3.11, + "learning_rate": 3.826429980276134e-05, + "loss": 83.7789, + "step": 3682, + "task_loss": 2.1487691402435303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.750877065423036, + "compression/movement_sparsity/importance_threshold": -0.0017447909712553847, + "compression/movement_sparsity/linear_layer_sparsity": 0.7425370450500777, + "compression/movement_sparsity/model_sparsity": 0.7170286132414697, + "compression_loss": 80.24897003173828, + "distillation_loss": 3.018580675125122, + "epoch": 3.11, + "learning_rate": 3.8259603644219036e-05, + "loss": 83.8925, + "step": 3683, + "task_loss": 1.8671622276306152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7512113090294802, + "compression/movement_sparsity/importance_threshold": -0.0017424500176707056, + "compression/movement_sparsity/linear_layer_sparsity": 0.7430302166993293, + "compression/movement_sparsity/model_sparsity": 0.7175048429273961, + "compression_loss": 80.2842788696289, + "distillation_loss": 3.3490335941314697, + "epoch": 3.11, + "learning_rate": 3.8254907485676716e-05, + "loss": 84.429, + "step": 3684, + "task_loss": 1.9273213148117065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7515452535365199, + "compression/movement_sparsity/importance_threshold": -0.0017401111588989408, + "compression/movement_sparsity/linear_layer_sparsity": 0.7433543274998398, + "compression/movement_sparsity/model_sparsity": 0.7178178195248276, + "compression_loss": 80.31957244873047, + "distillation_loss": 4.245872974395752, + "epoch": 3.11, + "learning_rate": 3.82502113271344e-05, + "loss": 83.8295, + "step": 3685, + "task_loss": 1.7592244148254395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.751878899078041, + "compression/movement_sparsity/importance_threshold": -0.001737774394002388, + "compression/movement_sparsity/linear_layer_sparsity": 0.743680536953854, + "compression/movement_sparsity/model_sparsity": 0.7181328226805588, + "compression_loss": 80.35487365722656, + "distillation_loss": 4.044859409332275, + "epoch": 3.12, + "learning_rate": 3.8245515168592095e-05, + "loss": 83.2428, + "step": 3686, + "task_loss": 1.762267827987671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7522122457879278, + "compression/movement_sparsity/importance_threshold": -0.0017354397220433583, + "compression/movement_sparsity/linear_layer_sparsity": 0.7439549239753229, + "compression/movement_sparsity/model_sparsity": 0.718397783663727, + "compression_loss": 80.39007568359375, + "distillation_loss": 4.115485191345215, + "epoch": 3.12, + "learning_rate": 3.824081901004978e-05, + "loss": 83.6271, + "step": 3687, + "task_loss": 2.304518222808838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7525452938000659, + "compression/movement_sparsity/importance_threshold": -0.001733107142084151, + "compression/movement_sparsity/linear_layer_sparsity": 0.7442654769972313, + "compression/movement_sparsity/model_sparsity": 0.7186976682339601, + "compression_loss": 80.42536926269531, + "distillation_loss": 3.246710777282715, + "epoch": 3.12, + "learning_rate": 3.823612285150747e-05, + "loss": 83.7131, + "step": 3688, + "task_loss": 1.9812899827957153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7528780432483407, + "compression/movement_sparsity/importance_threshold": -0.0017307766531870685, + "compression/movement_sparsity/linear_layer_sparsity": 0.7446811654051851, + "compression/movement_sparsity/model_sparsity": 0.7190990764662936, + "compression_loss": 80.4605712890625, + "distillation_loss": 3.553994655609131, + "epoch": 3.12, + "learning_rate": 3.8231426692965154e-05, + "loss": 83.2559, + "step": 3689, + "task_loss": 2.5499186515808105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7532104942666369, + "compression/movement_sparsity/importance_threshold": -0.0017284482544144188, + "compression/movement_sparsity/linear_layer_sparsity": 0.7450368514015953, + "compression/movement_sparsity/model_sparsity": 0.7194425435545089, + "compression_loss": 80.49568939208984, + "distillation_loss": 3.2814273834228516, + "epoch": 3.12, + "learning_rate": 3.822673053442285e-05, + "loss": 83.4977, + "step": 3690, + "task_loss": 2.414238929748535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.75354264698884, + "compression/movement_sparsity/importance_threshold": -0.0017261219448285034, + "compression/movement_sparsity/linear_layer_sparsity": 0.7452704339214142, + "compression/movement_sparsity/model_sparsity": 0.7196681017961881, + "compression_loss": 80.53082275390625, + "distillation_loss": 2.927618980407715, + "epoch": 3.12, + "learning_rate": 3.8222034375880534e-05, + "loss": 84.1464, + "step": 3691, + "task_loss": 2.2867934703826904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7538745015488346, + "compression/movement_sparsity/importance_threshold": -0.0017237977234916286, + "compression/movement_sparsity/linear_layer_sparsity": 0.7455817381658837, + "compression/movement_sparsity/model_sparsity": 0.7199687117821763, + "compression_loss": 80.56590270996094, + "distillation_loss": 4.218554973602295, + "epoch": 3.12, + "learning_rate": 3.8217338217338214e-05, + "loss": 84.183, + "step": 3692, + "task_loss": 2.631753444671631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7542060580805066, + "compression/movement_sparsity/importance_threshold": -0.001721475589466095, + "compression/movement_sparsity/linear_layer_sparsity": 0.745932571026066, + "compression/movement_sparsity/model_sparsity": 0.7203074924543232, + "compression_loss": 80.6009292602539, + "distillation_loss": 2.8070919513702393, + "epoch": 3.12, + "learning_rate": 3.821264205879591e-05, + "loss": 83.7731, + "step": 3693, + "task_loss": 2.3226654529571533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7545373167177403, + "compression/movement_sparsity/importance_threshold": -0.0017191555418142115, + "compression/movement_sparsity/linear_layer_sparsity": 0.7462844770613356, + "compression/movement_sparsity/model_sparsity": 0.7206473094346917, + "compression_loss": 80.63587188720703, + "distillation_loss": 4.210654258728027, + "epoch": 3.12, + "learning_rate": 3.820794590025359e-05, + "loss": 84.3579, + "step": 3694, + "task_loss": 1.7326912879943848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7548682775944217, + "compression/movement_sparsity/importance_threshold": -0.0017168375795982753, + "compression/movement_sparsity/linear_layer_sparsity": 0.746650894808618, + "compression/movement_sparsity/model_sparsity": 0.7210011396051221, + "compression_loss": 80.67080688476562, + "distillation_loss": 3.219837188720703, + "epoch": 3.12, + "learning_rate": 3.8203249741711286e-05, + "loss": 83.559, + "step": 3695, + "task_loss": 2.899458408355713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7551989408444351, + "compression/movement_sparsity/importance_threshold": -0.0017145217018805971, + "compression/movement_sparsity/linear_layer_sparsity": 0.7469993070627703, + "compression/movement_sparsity/model_sparsity": 0.7213375828265028, + "compression_loss": 80.70569610595703, + "distillation_loss": 3.9166975021362305, + "epoch": 3.12, + "learning_rate": 3.819855358316897e-05, + "loss": 83.8487, + "step": 3696, + "task_loss": 3.231672763824463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7555293066016664, + "compression/movement_sparsity/importance_threshold": -0.0017122079077234757, + "compression/movement_sparsity/linear_layer_sparsity": 0.7471278376657173, + "compression/movement_sparsity/model_sparsity": 0.7214616980078322, + "compression_loss": 80.7405776977539, + "distillation_loss": 4.193362712860107, + "epoch": 3.13, + "learning_rate": 3.819385742462666e-05, + "loss": 84.9088, + "step": 3697, + "task_loss": 3.107306718826294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.755859375, + "compression/movement_sparsity/importance_threshold": -0.00170989619618922, + "compression/movement_sparsity/linear_layer_sparsity": 0.7475332116686662, + "compression/movement_sparsity/model_sparsity": 0.7218531461667034, + "compression_loss": 80.77540588378906, + "distillation_loss": 5.4476823806762695, + "epoch": 3.13, + "learning_rate": 3.8189161266084345e-05, + "loss": 85.152, + "step": 3698, + "task_loss": 3.258300542831421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7561891461733214, + "compression/movement_sparsity/importance_threshold": -0.0017075865663401316, + "compression/movement_sparsity/linear_layer_sparsity": 0.7479199124250971, + "compression/movement_sparsity/model_sparsity": 0.7222265625625204, + "compression_loss": 80.81019592285156, + "distillation_loss": 3.840869665145874, + "epoch": 3.13, + "learning_rate": 3.818446510754203e-05, + "loss": 84.3903, + "step": 3699, + "task_loss": 3.404210329055786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7565186202555162, + "compression/movement_sparsity/importance_threshold": -0.0017052790172385115, + "compression/movement_sparsity/linear_layer_sparsity": 0.7482691951434869, + "compression/movement_sparsity/model_sparsity": 0.722563846345014, + "compression_loss": 80.8449478149414, + "distillation_loss": 4.6749067306518555, + "epoch": 3.13, + "learning_rate": 3.8179768948999725e-05, + "loss": 84.6341, + "step": 3700, + "task_loss": 2.163327693939209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7568477973804688, + "compression/movement_sparsity/importance_threshold": -0.0017029735479466672, + "compression/movement_sparsity/linear_layer_sparsity": 0.7484824827299895, + "compression/movement_sparsity/model_sparsity": 0.722769806846771, + "compression_loss": 80.87963104248047, + "distillation_loss": 4.222968101501465, + "epoch": 3.13, + "learning_rate": 3.8175072790457404e-05, + "loss": 85.2322, + "step": 3701, + "task_loss": 3.54156756401062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7571766776820649, + "compression/movement_sparsity/importance_threshold": -0.0017006701575269017, + "compression/movement_sparsity/linear_layer_sparsity": 0.7488716994857917, + "compression/movement_sparsity/model_sparsity": 0.7231456528096406, + "compression_loss": 80.91436767578125, + "distillation_loss": 4.028522491455078, + "epoch": 3.13, + "learning_rate": 3.81703766319151e-05, + "loss": 84.5291, + "step": 3702, + "task_loss": 2.719282627105713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.757505261294189, + "compression/movement_sparsity/importance_threshold": -0.0016983688450415207, + "compression/movement_sparsity/linear_layer_sparsity": 0.7492360424279055, + "compression/movement_sparsity/model_sparsity": 0.7234974794508428, + "compression_loss": 80.94901275634766, + "distillation_loss": 2.947445869445801, + "epoch": 3.13, + "learning_rate": 3.8165680473372784e-05, + "loss": 84.4737, + "step": 3703, + "task_loss": 1.441405177116394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.757833548350727, + "compression/movement_sparsity/importance_threshold": -0.0016960696095528246, + "compression/movement_sparsity/linear_layer_sparsity": 0.7494379782068189, + "compression/movement_sparsity/model_sparsity": 0.7236924781145233, + "compression_loss": 80.98356628417969, + "distillation_loss": 4.459933757781982, + "epoch": 3.13, + "learning_rate": 3.816098431483047e-05, + "loss": 85.0449, + "step": 3704, + "task_loss": 2.2515008449554443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7581615389855635, + "compression/movement_sparsity/importance_threshold": -0.0016937724501231207, + "compression/movement_sparsity/linear_layer_sparsity": 0.7498044794232748, + "compression/movement_sparsity/model_sparsity": 0.7240463888867044, + "compression_loss": 81.01819610595703, + "distillation_loss": 2.7761459350585938, + "epoch": 3.13, + "learning_rate": 3.8156288156288156e-05, + "loss": 84.6429, + "step": 3705, + "task_loss": 2.0169272422790527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.758489233332584, + "compression/movement_sparsity/importance_threshold": -0.0016914773658147095, + "compression/movement_sparsity/linear_layer_sparsity": 0.7501835606365864, + "compression/movement_sparsity/model_sparsity": 0.7244124474941486, + "compression_loss": 81.052734375, + "distillation_loss": 4.765417098999023, + "epoch": 3.13, + "learning_rate": 3.815159199774584e-05, + "loss": 84.7367, + "step": 3706, + "task_loss": 2.3753676414489746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7588166315256731, + "compression/movement_sparsity/importance_threshold": -0.0016891843556899, + "compression/movement_sparsity/linear_layer_sparsity": 0.7505824478923414, + "compression/movement_sparsity/model_sparsity": 0.7247976317455476, + "compression_loss": 81.08727264404297, + "distillation_loss": 4.686157703399658, + "epoch": 3.13, + "learning_rate": 3.8146895839203536e-05, + "loss": 84.6329, + "step": 3707, + "task_loss": 2.8983380794525146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7591437336987167, + "compression/movement_sparsity/importance_threshold": -0.0016868934188109912, + "compression/movement_sparsity/linear_layer_sparsity": 0.7508614137941824, + "compression/movement_sparsity/model_sparsity": 0.7250670143104608, + "compression_loss": 81.1216812133789, + "distillation_loss": 2.9957194328308105, + "epoch": 3.13, + "learning_rate": 3.814219968066122e-05, + "loss": 84.8033, + "step": 3708, + "task_loss": 0.7069545388221741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7594705399855991, + "compression/movement_sparsity/importance_threshold": -0.0016846045542402918, + "compression/movement_sparsity/linear_layer_sparsity": 0.7511791451650076, + "compression/movement_sparsity/model_sparsity": 0.7253738306312422, + "compression_loss": 81.15621185302734, + "distillation_loss": 4.061777591705322, + "epoch": 3.14, + "learning_rate": 3.813750352211891e-05, + "loss": 84.8296, + "step": 3709, + "task_loss": 2.2253663539886475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.759797050520206, + "compression/movement_sparsity/importance_threshold": -0.0016823177610401025, + "compression/movement_sparsity/linear_layer_sparsity": 0.7514032598957237, + "compression/movement_sparsity/model_sparsity": 0.7255902463315006, + "compression_loss": 81.19059753417969, + "distillation_loss": 4.129062652587891, + "epoch": 3.14, + "learning_rate": 3.8132807363576595e-05, + "loss": 84.6115, + "step": 3710, + "task_loss": 1.870001196861267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7601232654364226, + "compression/movement_sparsity/importance_threshold": -0.0016800330382727262, + "compression/movement_sparsity/linear_layer_sparsity": 0.7517578488687113, + "compression/movement_sparsity/model_sparsity": 0.7259326540824228, + "compression_loss": 81.22498321533203, + "distillation_loss": 3.150096893310547, + "epoch": 3.14, + "learning_rate": 3.812811120503428e-05, + "loss": 85.1965, + "step": 3711, + "task_loss": 3.007744312286377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7604491848681336, + "compression/movement_sparsity/importance_threshold": -0.0016777503850004703, + "compression/movement_sparsity/linear_layer_sparsity": 0.7520319974068275, + "compression/movement_sparsity/model_sparsity": 0.726197384774875, + "compression_loss": 81.25930786132812, + "distillation_loss": 5.543707847595215, + "epoch": 3.14, + "learning_rate": 3.8123415046491974e-05, + "loss": 84.9724, + "step": 3712, + "task_loss": 2.6793017387390137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7607748089492248, + "compression/movement_sparsity/importance_threshold": -0.0016754698002856344, + "compression/movement_sparsity/linear_layer_sparsity": 0.7523608182535539, + "compression/movement_sparsity/model_sparsity": 0.7265149096139454, + "compression_loss": 81.29362487792969, + "distillation_loss": 3.457920551300049, + "epoch": 3.14, + "learning_rate": 3.811871888794966e-05, + "loss": 84.4721, + "step": 3713, + "task_loss": 2.228994846343994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7611001378135805, + "compression/movement_sparsity/importance_threshold": -0.0016731912831905284, + "compression/movement_sparsity/linear_layer_sparsity": 0.7528234163369872, + "compression/movement_sparsity/model_sparsity": 0.7269616160300946, + "compression_loss": 81.32791900634766, + "distillation_loss": 3.6463546752929688, + "epoch": 3.14, + "learning_rate": 3.811402272940735e-05, + "loss": 84.6943, + "step": 3714, + "task_loss": 1.452162742614746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7614251715950866, + "compression/movement_sparsity/importance_threshold": -0.001670914832777451, + "compression/movement_sparsity/linear_layer_sparsity": 0.7531967501014986, + "compression/movement_sparsity/model_sparsity": 0.7273221246312859, + "compression_loss": 81.36219024658203, + "distillation_loss": 3.8088483810424805, + "epoch": 3.14, + "learning_rate": 3.810932657086503e-05, + "loss": 84.9145, + "step": 3715, + "task_loss": 1.6999133825302124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7617499104276276, + "compression/movement_sparsity/importance_threshold": -0.0016686404481087097, + "compression/movement_sparsity/linear_layer_sparsity": 0.7535338343963968, + "compression/movement_sparsity/model_sparsity": 0.7276476290436618, + "compression_loss": 81.39639282226562, + "distillation_loss": 3.507528781890869, + "epoch": 3.14, + "learning_rate": 3.810463041232272e-05, + "loss": 84.301, + "step": 3716, + "task_loss": 1.8312020301818848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7620743544450893, + "compression/movement_sparsity/importance_threshold": -0.0016663681282466057, + "compression/movement_sparsity/linear_layer_sparsity": 0.753914895021536, + "compression/movement_sparsity/model_sparsity": 0.7280155990640479, + "compression_loss": 81.43057250976562, + "distillation_loss": 4.118577480316162, + "epoch": 3.14, + "learning_rate": 3.809993425378041e-05, + "loss": 84.9717, + "step": 3717, + "task_loss": 2.585632562637329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7623985037813562, + "compression/movement_sparsity/importance_threshold": -0.0016640978722534455, + "compression/movement_sparsity/linear_layer_sparsity": 0.7543006537687238, + "compression/movement_sparsity/model_sparsity": 0.728388105811537, + "compression_loss": 81.46473693847656, + "distillation_loss": 3.6276211738586426, + "epoch": 3.14, + "learning_rate": 3.809523809523809e-05, + "loss": 85.3466, + "step": 3718, + "task_loss": 2.4423177242279053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7627223585703138, + "compression/movement_sparsity/importance_threshold": -0.001661829679191532, + "compression/movement_sparsity/linear_layer_sparsity": 0.7546235721524706, + "compression/movement_sparsity/model_sparsity": 0.728699930955389, + "compression_loss": 81.49889373779297, + "distillation_loss": 2.4993996620178223, + "epoch": 3.14, + "learning_rate": 3.8090541936695785e-05, + "loss": 85.2204, + "step": 3719, + "task_loss": 1.887010097503662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7630459189458474, + "compression/movement_sparsity/importance_threshold": -0.0016595635481231677, + "compression/movement_sparsity/linear_layer_sparsity": 0.755037471935279, + "compression/movement_sparsity/model_sparsity": 0.7290996120073533, + "compression_loss": 81.5329360961914, + "distillation_loss": 3.0448527336120605, + "epoch": 3.14, + "learning_rate": 3.808584577815347e-05, + "loss": 84.4637, + "step": 3720, + "task_loss": 1.684658169746399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7633691850418416, + "compression/movement_sparsity/importance_threshold": -0.0016572994781106597, + "compression/movement_sparsity/linear_layer_sparsity": 0.7554243992508952, + "compression/movement_sparsity/model_sparsity": 0.7294732471793504, + "compression_loss": 81.5670166015625, + "distillation_loss": 3.7425639629364014, + "epoch": 3.15, + "learning_rate": 3.8081149619611165e-05, + "loss": 85.2015, + "step": 3721, + "task_loss": 2.3294520378112793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7636921569921822, + "compression/movement_sparsity/importance_threshold": -0.0016550374682163084, + "compression/movement_sparsity/linear_layer_sparsity": 0.7556088780483896, + "compression/movement_sparsity/model_sparsity": 0.7296513885626277, + "compression_loss": 81.6010513305664, + "distillation_loss": 2.205416202545166, + "epoch": 3.15, + "learning_rate": 3.8076453461068844e-05, + "loss": 84.5668, + "step": 3722, + "task_loss": 2.1741702556610107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7640148349307536, + "compression/movement_sparsity/importance_threshold": -0.0016527775175024214, + "compression/movement_sparsity/linear_layer_sparsity": 0.7560314228527341, + "compression/movement_sparsity/model_sparsity": 0.730059417653043, + "compression_loss": 81.63509368896484, + "distillation_loss": 3.2796034812927246, + "epoch": 3.15, + "learning_rate": 3.807175730252653e-05, + "loss": 84.711, + "step": 3723, + "task_loss": 1.651004672050476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7643372189914417, + "compression/movement_sparsity/importance_threshold": -0.0016505196250312989, + "compression/movement_sparsity/linear_layer_sparsity": 0.7563347617532228, + "compression/movement_sparsity/model_sparsity": 0.7303523359291204, + "compression_loss": 81.66900634765625, + "distillation_loss": 3.9813239574432373, + "epoch": 3.15, + "learning_rate": 3.8067061143984224e-05, + "loss": 85.4953, + "step": 3724, + "task_loss": 2.0276739597320557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7646593093081311, + "compression/movement_sparsity/importance_threshold": -0.0016482637898652484, + "compression/movement_sparsity/linear_layer_sparsity": 0.7566772834685629, + "compression/movement_sparsity/model_sparsity": 0.7306830909698185, + "compression_loss": 81.70294952392578, + "distillation_loss": 2.271095037460327, + "epoch": 3.15, + "learning_rate": 3.806236498544191e-05, + "loss": 85.3375, + "step": 3725, + "task_loss": 1.721961498260498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7649811060147073, + "compression/movement_sparsity/importance_threshold": -0.001646010011066571, + "compression/movement_sparsity/linear_layer_sparsity": 0.7568961038688486, + "compression/movement_sparsity/model_sparsity": 0.7308943942161842, + "compression_loss": 81.73682403564453, + "distillation_loss": 5.023978233337402, + "epoch": 3.15, + "learning_rate": 3.80576688268996e-05, + "loss": 85.2046, + "step": 3726, + "task_loss": 3.194732427597046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.765302609245055, + "compression/movement_sparsity/importance_threshold": -0.0016437582876975736, + "compression/movement_sparsity/linear_layer_sparsity": 0.7572975190481424, + "compression/movement_sparsity/model_sparsity": 0.7312820195491716, + "compression_loss": 81.77066040039062, + "distillation_loss": 2.6423795223236084, + "epoch": 3.15, + "learning_rate": 3.805297266835728e-05, + "loss": 84.39, + "step": 3727, + "task_loss": 0.879217267036438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7656238191330595, + "compression/movement_sparsity/importance_threshold": -0.0016415086188205598, + "compression/movement_sparsity/linear_layer_sparsity": 0.7576272461316093, + "compression/movement_sparsity/model_sparsity": 0.7316004194929623, + "compression_loss": 81.80448150634766, + "distillation_loss": 2.6968331336975098, + "epoch": 3.15, + "learning_rate": 3.8048276509814976e-05, + "loss": 85.3542, + "step": 3728, + "task_loss": 1.652719497680664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7659447358126066, + "compression/movement_sparsity/importance_threshold": -0.0016392610034978283, + "compression/movement_sparsity/linear_layer_sparsity": 0.7578896374404364, + "compression/movement_sparsity/model_sparsity": 0.7318537968531211, + "compression_loss": 81.8382568359375, + "distillation_loss": 3.2174360752105713, + "epoch": 3.15, + "learning_rate": 3.804358035127266e-05, + "loss": 85.4225, + "step": 3729, + "task_loss": 1.9126152992248535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7662653594175803, + "compression/movement_sparsity/importance_threshold": -0.0016370154407916908, + "compression/movement_sparsity/linear_layer_sparsity": 0.7582424616366139, + "compression/movement_sparsity/model_sparsity": 0.7321945004527458, + "compression_loss": 81.87201690673828, + "distillation_loss": 4.195473670959473, + "epoch": 3.15, + "learning_rate": 3.803888419273035e-05, + "loss": 84.9587, + "step": 3730, + "task_loss": 3.317570924758911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7665856900818668, + "compression/movement_sparsity/importance_threshold": -0.0016347719297644444, + "compression/movement_sparsity/linear_layer_sparsity": 0.7586298301464326, + "compression/movement_sparsity/model_sparsity": 0.7325685616625672, + "compression_loss": 81.90574645996094, + "distillation_loss": 4.281726360321045, + "epoch": 3.15, + "learning_rate": 3.8034188034188035e-05, + "loss": 85.6519, + "step": 3731, + "task_loss": 2.209256649017334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7669057279393504, + "compression/movement_sparsity/importance_threshold": -0.0016325304694783998, + "compression/movement_sparsity/linear_layer_sparsity": 0.7589294725549542, + "compression/movement_sparsity/model_sparsity": 0.7328579104325482, + "compression_loss": 81.9394302368164, + "distillation_loss": 4.009186267852783, + "epoch": 3.15, + "learning_rate": 3.802949187564572e-05, + "loss": 85.679, + "step": 3732, + "task_loss": 1.9952187538146973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.767225473123917, + "compression/movement_sparsity/importance_threshold": -0.001630291058995854, + "compression/movement_sparsity/linear_layer_sparsity": 0.7593051196079867, + "compression/movement_sparsity/model_sparsity": 0.7332206528536837, + "compression_loss": 81.97311401367188, + "distillation_loss": 3.398517370223999, + "epoch": 3.16, + "learning_rate": 3.8024795717103414e-05, + "loss": 84.9304, + "step": 3733, + "task_loss": 1.4153525829315186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7675449257694509, + "compression/movement_sparsity/importance_threshold": -0.001628053697379118, + "compression/movement_sparsity/linear_layer_sparsity": 0.759611880744587, + "compression/movement_sparsity/model_sparsity": 0.733516875801534, + "compression_loss": 82.00682067871094, + "distillation_loss": 3.0639572143554688, + "epoch": 3.16, + "learning_rate": 3.80200995585611e-05, + "loss": 85.4746, + "step": 3734, + "task_loss": 2.5014772415161133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.767864086009838, + "compression/movement_sparsity/importance_threshold": -0.0016258183836904893, + "compression/movement_sparsity/linear_layer_sparsity": 0.7599700588920331, + "compression/movement_sparsity/model_sparsity": 0.7338627494277304, + "compression_loss": 82.04039001464844, + "distillation_loss": 4.306668281555176, + "epoch": 3.16, + "learning_rate": 3.801540340001879e-05, + "loss": 85.4426, + "step": 3735, + "task_loss": 2.2235805988311768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.768182953978963, + "compression/movement_sparsity/importance_threshold": -0.0016235851169922773, + "compression/movement_sparsity/linear_layer_sparsity": 0.7603497482378943, + "compression/movement_sparsity/model_sparsity": 0.7342293952765002, + "compression_loss": 82.07396697998047, + "distillation_loss": 2.787050724029541, + "epoch": 3.16, + "learning_rate": 3.8010707241476473e-05, + "loss": 85.4374, + "step": 3736, + "task_loss": 1.996794581413269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7685015298107113, + "compression/movement_sparsity/importance_threshold": -0.0016213538963467805, + "compression/movement_sparsity/linear_layer_sparsity": 0.7607166310277145, + "compression/movement_sparsity/model_sparsity": 0.7345836745138267, + "compression_loss": 82.10758972167969, + "distillation_loss": 2.326565980911255, + "epoch": 3.16, + "learning_rate": 3.800601108293416e-05, + "loss": 85.4147, + "step": 3737, + "task_loss": 1.564960241317749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7688198136389677, + "compression/movement_sparsity/importance_threshold": -0.001619124720816309, + "compression/movement_sparsity/linear_layer_sparsity": 0.7610723885691304, + "compression/movement_sparsity/model_sparsity": 0.7349272106892568, + "compression_loss": 82.14110565185547, + "distillation_loss": 3.0554215908050537, + "epoch": 3.16, + "learning_rate": 3.800131492439185e-05, + "loss": 85.5256, + "step": 3738, + "task_loss": 2.169434070587158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7691378055976175, + "compression/movement_sparsity/importance_threshold": -0.0016168975894631631, + "compression/movement_sparsity/linear_layer_sparsity": 0.7615073464319838, + "compression/movement_sparsity/model_sparsity": 0.7353472264114342, + "compression_loss": 82.17452239990234, + "distillation_loss": 3.0137085914611816, + "epoch": 3.16, + "learning_rate": 3.799661876584953e-05, + "loss": 85.9602, + "step": 3739, + "task_loss": 2.209989547729492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7694555058205462, + "compression/movement_sparsity/importance_threshold": -0.0016146725013496451, + "compression/movement_sparsity/linear_layer_sparsity": 0.7618246604569417, + "compression/movement_sparsity/model_sparsity": 0.7356536397234628, + "compression_loss": 82.20801544189453, + "distillation_loss": 4.140649795532227, + "epoch": 3.16, + "learning_rate": 3.7991922607307226e-05, + "loss": 86.2876, + "step": 3740, + "task_loss": 3.157860040664673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7697729144416383, + "compression/movement_sparsity/importance_threshold": -0.001612449455538064, + "compression/movement_sparsity/linear_layer_sparsity": 0.7621022312311694, + "compression/movement_sparsity/model_sparsity": 0.7359216750876881, + "compression_loss": 82.24140930175781, + "distillation_loss": 3.3894877433776855, + "epoch": 3.16, + "learning_rate": 3.798722644876491e-05, + "loss": 85.6449, + "step": 3741, + "task_loss": 1.4611133337020874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7700900315947796, + "compression/movement_sparsity/importance_threshold": -0.0016102284510907195, + "compression/movement_sparsity/linear_layer_sparsity": 0.7623060629527368, + "compression/movement_sparsity/model_sparsity": 0.73611850456256, + "compression_loss": 82.2748031616211, + "distillation_loss": 3.5760154724121094, + "epoch": 3.16, + "learning_rate": 3.79825302902226e-05, + "loss": 86.1086, + "step": 3742, + "task_loss": 2.121083974838257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7704068574138547, + "compression/movement_sparsity/importance_threshold": -0.001608009487069917, + "compression/movement_sparsity/linear_layer_sparsity": 0.7626130506485221, + "compression/movement_sparsity/model_sparsity": 0.7364149462865904, + "compression_loss": 82.30810546875, + "distillation_loss": 4.571725845336914, + "epoch": 3.16, + "learning_rate": 3.797783413168029e-05, + "loss": 86.5255, + "step": 3743, + "task_loss": 2.3491389751434326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7707233920327491, + "compression/movement_sparsity/importance_threshold": -0.0016057925625379606, + "compression/movement_sparsity/linear_layer_sparsity": 0.7629625480019292, + "compression/movement_sparsity/model_sparsity": 0.7367524373307284, + "compression_loss": 82.3414077758789, + "distillation_loss": 3.604790449142456, + "epoch": 3.16, + "learning_rate": 3.797313797313797e-05, + "loss": 86.1671, + "step": 3744, + "task_loss": 2.0918924808502197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7710396355853476, + "compression/movement_sparsity/importance_threshold": -0.001603577676557156, + "compression/movement_sparsity/linear_layer_sparsity": 0.7633271890482339, + "compression/movement_sparsity/model_sparsity": 0.7371045518353254, + "compression_loss": 82.37469482421875, + "distillation_loss": 4.50681734085083, + "epoch": 3.17, + "learning_rate": 3.7968441814595664e-05, + "loss": 85.9067, + "step": 3745, + "task_loss": 2.2871451377868652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7713555882055358, + "compression/movement_sparsity/importance_threshold": -0.0016013648281898024, + "compression/movement_sparsity/linear_layer_sparsity": 0.7636456835657879, + "compression/movement_sparsity/model_sparsity": 0.7374121050863977, + "compression_loss": 82.4079360961914, + "distillation_loss": 3.3302059173583984, + "epoch": 3.17, + "learning_rate": 3.796374565605335e-05, + "loss": 85.8246, + "step": 3746, + "task_loss": 2.2445249557495117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7716712500271983, + "compression/movement_sparsity/importance_threshold": -0.0015991540164982092, + "compression/movement_sparsity/linear_layer_sparsity": 0.7640252775183078, + "compression/movement_sparsity/model_sparsity": 0.737778658818881, + "compression_loss": 82.44114685058594, + "distillation_loss": 2.938812494277954, + "epoch": 3.17, + "learning_rate": 3.795904949751104e-05, + "loss": 85.9171, + "step": 3747, + "task_loss": 1.5680830478668213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7719866211842208, + "compression/movement_sparsity/importance_threshold": -0.001596945240544676, + "compression/movement_sparsity/linear_layer_sparsity": 0.7642387201189899, + "compression/movement_sparsity/model_sparsity": 0.7379847690096033, + "compression_loss": 82.4743423461914, + "distillation_loss": 2.9713926315307617, + "epoch": 3.17, + "learning_rate": 3.795435333896872e-05, + "loss": 85.7899, + "step": 3748, + "task_loss": 1.6404054164886475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7723017018104879, + "compression/movement_sparsity/importance_threshold": -0.0015947384993915108, + "compression/movement_sparsity/linear_layer_sparsity": 0.7644708836628601, + "compression/movement_sparsity/model_sparsity": 0.738208957021523, + "compression_loss": 82.50749969482422, + "distillation_loss": 3.577488660812378, + "epoch": 3.17, + "learning_rate": 3.794965718042641e-05, + "loss": 85.723, + "step": 3749, + "task_loss": 2.3458402156829834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7726164920398849, + "compression/movement_sparsity/importance_threshold": -0.001592533792101016, + "compression/movement_sparsity/linear_layer_sparsity": 0.7648701167194765, + "compression/movement_sparsity/model_sparsity": 0.7385944751944601, + "compression_loss": 82.54058074951172, + "distillation_loss": 3.8069372177124023, + "epoch": 3.17, + "learning_rate": 3.79449610218841e-05, + "loss": 85.9233, + "step": 3750, + "task_loss": 1.3519659042358398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7729309920062974, + "compression/movement_sparsity/importance_threshold": -0.0015903311177354919, + "compression/movement_sparsity/linear_layer_sparsity": 0.765081639529169, + "compression/movement_sparsity/model_sparsity": 0.7387987315449195, + "compression_loss": 82.57363891601562, + "distillation_loss": 3.5107202529907227, + "epoch": 3.17, + "learning_rate": 3.794026486334179e-05, + "loss": 86.1822, + "step": 3751, + "task_loss": 2.120448350906372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7732452018436099, + "compression/movement_sparsity/importance_threshold": -0.0015881304753572485, + "compression/movement_sparsity/linear_layer_sparsity": 0.7652966441958112, + "compression/movement_sparsity/model_sparsity": 0.7390063501398308, + "compression_loss": 82.60675811767578, + "distillation_loss": 3.8247909545898438, + "epoch": 3.17, + "learning_rate": 3.7935568704799475e-05, + "loss": 86.0003, + "step": 3752, + "task_loss": 1.4965037107467651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7735591216857081, + "compression/movement_sparsity/importance_threshold": -0.0015859318640285853, + "compression/movement_sparsity/linear_layer_sparsity": 0.7655352110177019, + "compression/movement_sparsity/model_sparsity": 0.7392367214574722, + "compression_loss": 82.63980865478516, + "distillation_loss": 3.484588623046875, + "epoch": 3.17, + "learning_rate": 3.793087254625716e-05, + "loss": 87.1008, + "step": 3753, + "task_loss": 1.8741004467010498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7738727516664765, + "compression/movement_sparsity/importance_threshold": -0.0015837352828118097, + "compression/movement_sparsity/linear_layer_sparsity": 0.7657559154364741, + "compression/movement_sparsity/model_sparsity": 0.7394498440004933, + "compression_loss": 82.67288970947266, + "distillation_loss": 4.289256572723389, + "epoch": 3.17, + "learning_rate": 3.792617638771485e-05, + "loss": 86.6443, + "step": 3754, + "task_loss": 2.6934778690338135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7741860919198009, + "compression/movement_sparsity/importance_threshold": -0.0015815407307692222, + "compression/movement_sparsity/linear_layer_sparsity": 0.7661891085225174, + "compression/movement_sparsity/model_sparsity": 0.7398681555713732, + "compression_loss": 82.7059097290039, + "distillation_loss": 4.224939823150635, + "epoch": 3.17, + "learning_rate": 3.792148022917254e-05, + "loss": 87.2242, + "step": 3755, + "task_loss": 2.5787277221679688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.774499142579566, + "compression/movement_sparsity/importance_threshold": -0.0015793482069631293, + "compression/movement_sparsity/linear_layer_sparsity": 0.766576512804839, + "compression/movement_sparsity/model_sparsity": 0.740242251324802, + "compression_loss": 82.7389144897461, + "distillation_loss": 4.364882469177246, + "epoch": 3.17, + "learning_rate": 3.791678407063022e-05, + "loss": 86.2716, + "step": 3756, + "task_loss": 2.642099380493164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7748119037796571, + "compression/movement_sparsity/importance_threshold": -0.001577157710455834, + "compression/movement_sparsity/linear_layer_sparsity": 0.7670453233796105, + "compression/movement_sparsity/model_sparsity": 0.7406949568141002, + "compression_loss": 82.77190399169922, + "distillation_loss": 3.3080596923828125, + "epoch": 3.18, + "learning_rate": 3.7912087912087914e-05, + "loss": 87.1048, + "step": 3757, + "task_loss": 1.4967310428619385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7751243756539593, + "compression/movement_sparsity/importance_threshold": -0.001574969240309641, + "compression/movement_sparsity/linear_layer_sparsity": 0.7673615284569784, + "compression/movement_sparsity/model_sparsity": 0.7410002992742999, + "compression_loss": 82.80481719970703, + "distillation_loss": 3.5971250534057617, + "epoch": 3.18, + "learning_rate": 3.79073917535456e-05, + "loss": 86.7371, + "step": 3758, + "task_loss": 1.5292072296142578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7754365583363576, + "compression/movement_sparsity/importance_threshold": -0.001572782795586855, + "compression/movement_sparsity/linear_layer_sparsity": 0.7675814697290219, + "compression/movement_sparsity/model_sparsity": 0.7412126848870302, + "compression_loss": 82.83771514892578, + "distillation_loss": 4.425161361694336, + "epoch": 3.18, + "learning_rate": 3.790269559500329e-05, + "loss": 86.6261, + "step": 3759, + "task_loss": 2.2625224590301514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7757484519607377, + "compression/movement_sparsity/importance_threshold": -0.001570598375349775, + "compression/movement_sparsity/linear_layer_sparsity": 0.767750864454457, + "compression/movement_sparsity/model_sparsity": 0.7413762603825274, + "compression_loss": 82.8706283569336, + "distillation_loss": 3.4967360496520996, + "epoch": 3.18, + "learning_rate": 3.789799943646098e-05, + "loss": 86.4596, + "step": 3760, + "task_loss": 2.3827271461486816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.776060056660984, + "compression/movement_sparsity/importance_threshold": -0.0015684159786607125, + "compression/movement_sparsity/linear_layer_sparsity": 0.768040633652176, + "compression/movement_sparsity/model_sparsity": 0.7416560751168705, + "compression_loss": 82.90339660644531, + "distillation_loss": 3.4624850749969482, + "epoch": 3.18, + "learning_rate": 3.7893303277918666e-05, + "loss": 86.6068, + "step": 3761, + "task_loss": 1.4559991359710693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7763713725709822, + "compression/movement_sparsity/importance_threshold": -0.0015662356045819652, + "compression/movement_sparsity/linear_layer_sparsity": 0.7683057794437271, + "compression/movement_sparsity/model_sparsity": 0.7419121123347979, + "compression_loss": 82.93623352050781, + "distillation_loss": 2.6669912338256836, + "epoch": 3.18, + "learning_rate": 3.788860711937635e-05, + "loss": 86.1814, + "step": 3762, + "task_loss": 1.1519489288330078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7766823998246171, + "compression/movement_sparsity/importance_threshold": -0.0015640572521758407, + "compression/movement_sparsity/linear_layer_sparsity": 0.7685119483021512, + "compression/movement_sparsity/model_sparsity": 0.7421111986586856, + "compression_loss": 82.96902465820312, + "distillation_loss": 2.851030111312866, + "epoch": 3.18, + "learning_rate": 3.788391096083404e-05, + "loss": 85.6626, + "step": 3763, + "task_loss": 2.0943679809570312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7769931385557741, + "compression/movement_sparsity/importance_threshold": -0.0015618809205046403, + "compression/movement_sparsity/linear_layer_sparsity": 0.768793048629999, + "compression/movement_sparsity/model_sparsity": 0.742382642325506, + "compression_loss": 83.00176239013672, + "distillation_loss": 3.6951303482055664, + "epoch": 3.18, + "learning_rate": 3.787921480229173e-05, + "loss": 86.9678, + "step": 3764, + "task_loss": 2.1145670413970947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7773035888983381, + "compression/movement_sparsity/importance_threshold": -0.0015597066086306711, + "compression/movement_sparsity/linear_layer_sparsity": 0.7690655277846461, + "compression/movement_sparsity/model_sparsity": 0.7426457609829471, + "compression_loss": 83.03450012207031, + "distillation_loss": 5.087008476257324, + "epoch": 3.18, + "learning_rate": 3.787451864374941e-05, + "loss": 86.9128, + "step": 3765, + "task_loss": 2.8062498569488525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7776137509861945, + "compression/movement_sparsity/importance_threshold": -0.0015575343156162347, + "compression/movement_sparsity/linear_layer_sparsity": 0.7693148144332415, + "compression/movement_sparsity/model_sparsity": 0.7428864838682677, + "compression_loss": 83.06720733642578, + "distillation_loss": 4.947211742401123, + "epoch": 3.18, + "learning_rate": 3.7869822485207104e-05, + "loss": 87.0079, + "step": 3766, + "task_loss": 3.5668604373931885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.777923624953228, + "compression/movement_sparsity/importance_threshold": -0.0015553640405236374, + "compression/movement_sparsity/linear_layer_sparsity": 0.7695673563796014, + "compression/movement_sparsity/model_sparsity": 0.7431303502218604, + "compression_loss": 83.09982299804688, + "distillation_loss": 3.4829301834106445, + "epoch": 3.18, + "learning_rate": 3.786512632666479e-05, + "loss": 86.6857, + "step": 3767, + "task_loss": 1.8114540576934814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7782332109333243, + "compression/movement_sparsity/importance_threshold": -0.001553195782415179, + "compression/movement_sparsity/linear_layer_sparsity": 0.7699054184562457, + "compression/movement_sparsity/model_sparsity": 0.7434567988261713, + "compression_loss": 83.13243865966797, + "distillation_loss": 3.691450834274292, + "epoch": 3.19, + "learning_rate": 3.786043016812248e-05, + "loss": 87.2559, + "step": 3768, + "task_loss": 3.0252506732940674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.778542509060368, + "compression/movement_sparsity/importance_threshold": -0.001551029540353169, + "compression/movement_sparsity/linear_layer_sparsity": 0.7703197713574244, + "compression/movement_sparsity/model_sparsity": 0.7438569174304959, + "compression_loss": 83.16503143310547, + "distillation_loss": 3.975186824798584, + "epoch": 3.19, + "learning_rate": 3.785573400958016e-05, + "loss": 86.9055, + "step": 3769, + "task_loss": 1.3500369787216187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7788515194682446, + "compression/movement_sparsity/importance_threshold": -0.0015488653133999076, + "compression/movement_sparsity/linear_layer_sparsity": 0.7706234560587745, + "compression/movement_sparsity/model_sparsity": 0.7441501696281112, + "compression_loss": 83.19757080078125, + "distillation_loss": 3.2845211029052734, + "epoch": 3.19, + "learning_rate": 3.785103785103785e-05, + "loss": 86.6716, + "step": 3770, + "task_loss": 1.1226887702941895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7791602422908394, + "compression/movement_sparsity/importance_threshold": -0.0015467031006176982, + "compression/movement_sparsity/linear_layer_sparsity": 0.7708980219427579, + "compression/movement_sparsity/model_sparsity": 0.7444153033293164, + "compression_loss": 83.23015594482422, + "distillation_loss": 4.058924198150635, + "epoch": 3.19, + "learning_rate": 3.784634169249554e-05, + "loss": 87.3424, + "step": 3771, + "task_loss": 3.3591055870056152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.779468677662037, + "compression/movement_sparsity/importance_threshold": -0.0015445429010688483, + "compression/movement_sparsity/linear_layer_sparsity": 0.7712287387321386, + "compression/movement_sparsity/model_sparsity": 0.7447346589795779, + "compression_loss": 83.26264953613281, + "distillation_loss": 2.798048496246338, + "epoch": 3.19, + "learning_rate": 3.784164553395323e-05, + "loss": 86.5358, + "step": 3772, + "task_loss": 1.4800227880477905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.779776825715723, + "compression/movement_sparsity/importance_threshold": -0.0015423847138156576, + "compression/movement_sparsity/linear_layer_sparsity": 0.7714439580337983, + "compression/movement_sparsity/model_sparsity": 0.7449424848361336, + "compression_loss": 83.29512023925781, + "distillation_loss": 3.860445022583008, + "epoch": 3.19, + "learning_rate": 3.7836949375410915e-05, + "loss": 87.6684, + "step": 3773, + "task_loss": 2.187854051589966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7800846865857822, + "compression/movement_sparsity/importance_threshold": -0.001540228537920435, + "compression/movement_sparsity/linear_layer_sparsity": 0.77163324227085, + "compression/movement_sparsity/model_sparsity": 0.7451252665773361, + "compression_loss": 83.32754516601562, + "distillation_loss": 3.4418082237243652, + "epoch": 3.19, + "learning_rate": 3.78322532168686e-05, + "loss": 87.0282, + "step": 3774, + "task_loss": 2.210050344467163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7803922604061002, + "compression/movement_sparsity/importance_threshold": -0.0015380743724454794, + "compression/movement_sparsity/linear_layer_sparsity": 0.7719419947434454, + "compression/movement_sparsity/model_sparsity": 0.7454234124526642, + "compression_loss": 83.35993957519531, + "distillation_loss": 3.5467007160186768, + "epoch": 3.19, + "learning_rate": 3.782755705832629e-05, + "loss": 87.4188, + "step": 3775, + "task_loss": 1.9753005504608154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7806995473105616, + "compression/movement_sparsity/importance_threshold": -0.0015359222164530988, + "compression/movement_sparsity/linear_layer_sparsity": 0.772313086764441, + "compression/movement_sparsity/model_sparsity": 0.7457817563211261, + "compression_loss": 83.39227294921875, + "distillation_loss": 3.7774386405944824, + "epoch": 3.19, + "learning_rate": 3.782286089978398e-05, + "loss": 86.5792, + "step": 3776, + "task_loss": 1.415449857711792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7810065474330519, + "compression/movement_sparsity/importance_threshold": -0.0015337720690055947, + "compression/movement_sparsity/linear_layer_sparsity": 0.7727213941126283, + "compression/movement_sparsity/model_sparsity": 0.7461760370558027, + "compression_loss": 83.4245834350586, + "distillation_loss": 4.428663730621338, + "epoch": 3.19, + "learning_rate": 3.781816474124167e-05, + "loss": 87.4608, + "step": 3777, + "task_loss": 3.0786337852478027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.781313260907456, + "compression/movement_sparsity/importance_threshold": -0.0015316239291652727, + "compression/movement_sparsity/linear_layer_sparsity": 0.7731110878351359, + "compression/movement_sparsity/model_sparsity": 0.7465523436001041, + "compression_loss": 83.45686340332031, + "distillation_loss": 5.7508673667907715, + "epoch": 3.19, + "learning_rate": 3.7813468582699354e-05, + "loss": 86.99, + "step": 3778, + "task_loss": 3.0719847679138184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7816196878676595, + "compression/movement_sparsity/importance_threshold": -0.001529477795994434, + "compression/movement_sparsity/linear_layer_sparsity": 0.7734718654511266, + "compression/movement_sparsity/model_sparsity": 0.7469007273951037, + "compression_loss": 83.48909759521484, + "distillation_loss": 3.214224100112915, + "epoch": 3.19, + "learning_rate": 3.780877242415704e-05, + "loss": 86.8352, + "step": 3779, + "task_loss": 1.7268993854522705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7819258284475468, + "compression/movement_sparsity/importance_threshold": -0.0015273336685553878, + "compression/movement_sparsity/linear_layer_sparsity": 0.7737915046887734, + "compression/movement_sparsity/model_sparsity": 0.7472093860416122, + "compression_loss": 83.52133178710938, + "distillation_loss": 3.756901741027832, + "epoch": 3.2, + "learning_rate": 3.7804076265614727e-05, + "loss": 87.2606, + "step": 3780, + "task_loss": 3.4098353385925293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7822316827810035, + "compression/movement_sparsity/importance_threshold": -0.0015251915459104336, + "compression/movement_sparsity/linear_layer_sparsity": 0.7741004479480511, + "compression/movement_sparsity/model_sparsity": 0.747507716149513, + "compression_loss": 83.55345153808594, + "distillation_loss": 4.934846878051758, + "epoch": 3.2, + "learning_rate": 3.779938010707242e-05, + "loss": 87.5125, + "step": 3781, + "task_loss": 3.315051555633545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7825372510019151, + "compression/movement_sparsity/importance_threshold": -0.0015230514271218753, + "compression/movement_sparsity/linear_layer_sparsity": 0.7744260611936836, + "compression/movement_sparsity/model_sparsity": 0.7478221435784546, + "compression_loss": 83.58560180664062, + "distillation_loss": 2.9160609245300293, + "epoch": 3.2, + "learning_rate": 3.77946839485301e-05, + "loss": 87.2105, + "step": 3782, + "task_loss": 1.9747040271759033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.782842533244166, + "compression/movement_sparsity/importance_threshold": -0.0015209133112520194, + "compression/movement_sparsity/linear_layer_sparsity": 0.7747626446735411, + "compression/movement_sparsity/model_sparsity": 0.748147164380327, + "compression_loss": 83.61772918701172, + "distillation_loss": 3.6073789596557617, + "epoch": 3.2, + "learning_rate": 3.778998778998779e-05, + "loss": 87.1379, + "step": 3783, + "task_loss": 2.035456657409668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7831475296416418, + "compression/movement_sparsity/importance_threshold": -0.0015187771973631681, + "compression/movement_sparsity/linear_layer_sparsity": 0.7751337843912073, + "compression/movement_sparsity/model_sparsity": 0.7485055543069321, + "compression_loss": 83.64981842041016, + "distillation_loss": 4.648983955383301, + "epoch": 3.2, + "learning_rate": 3.778529163144548e-05, + "loss": 87.2781, + "step": 3784, + "task_loss": 1.729897141456604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7834522403282274, + "compression/movement_sparsity/importance_threshold": -0.001516643084517628, + "compression/movement_sparsity/linear_layer_sparsity": 0.7753683685411077, + "compression/movement_sparsity/model_sparsity": 0.7487320797696181, + "compression_loss": 83.68184661865234, + "distillation_loss": 5.057370185852051, + "epoch": 3.2, + "learning_rate": 3.778059547290317e-05, + "loss": 87.6817, + "step": 3785, + "task_loss": 2.5154778957366943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7837566654378082, + "compression/movement_sparsity/importance_threshold": -0.0015145109717776984, + "compression/movement_sparsity/linear_layer_sparsity": 0.7756306048357555, + "compression/movement_sparsity/model_sparsity": 0.7489853074408116, + "compression_loss": 83.71393585205078, + "distillation_loss": 3.712498903274536, + "epoch": 3.2, + "learning_rate": 3.777589931436085e-05, + "loss": 87.3655, + "step": 3786, + "task_loss": 1.7156929969787598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.784060805104269, + "compression/movement_sparsity/importance_threshold": -0.0015123808582056877, + "compression/movement_sparsity/linear_layer_sparsity": 0.7759069951173873, + "compression/movement_sparsity/model_sparsity": 0.7492522028659933, + "compression_loss": 83.74591827392578, + "distillation_loss": 3.684849977493286, + "epoch": 3.2, + "learning_rate": 3.777120315581854e-05, + "loss": 86.9913, + "step": 3787, + "task_loss": 2.582385540008545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7843646594614955, + "compression/movement_sparsity/importance_threshold": -0.0015102527428638964, + "compression/movement_sparsity/linear_layer_sparsity": 0.7762349454998764, + "compression/movement_sparsity/model_sparsity": 0.7495688871439505, + "compression_loss": 83.77786254882812, + "distillation_loss": 4.3996686935424805, + "epoch": 3.2, + "learning_rate": 3.776650699727623e-05, + "loss": 87.319, + "step": 3788, + "task_loss": 1.9721534252166748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7846682286433722, + "compression/movement_sparsity/importance_threshold": -0.0015081266248146317, + "compression/movement_sparsity/linear_layer_sparsity": 0.7764971698703567, + "compression/movement_sparsity/model_sparsity": 0.7498221033006083, + "compression_loss": 83.809814453125, + "distillation_loss": 4.153657913208008, + "epoch": 3.2, + "learning_rate": 3.776181083873392e-05, + "loss": 86.8527, + "step": 3789, + "task_loss": 2.619418144226074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7849715127837844, + "compression/movement_sparsity/importance_threshold": -0.0015060025031201968, + "compression/movement_sparsity/linear_layer_sparsity": 0.7766971262374425, + "compression/movement_sparsity/model_sparsity": 0.7500151905513469, + "compression_loss": 83.84172821044922, + "distillation_loss": 3.0294246673583984, + "epoch": 3.2, + "learning_rate": 3.775711468019161e-05, + "loss": 87.1422, + "step": 3790, + "task_loss": 0.7556239366531372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7852745120166177, + "compression/movement_sparsity/importance_threshold": -0.0015038803768428937, + "compression/movement_sparsity/linear_layer_sparsity": 0.7770216543838201, + "compression/movement_sparsity/model_sparsity": 0.7503285701575312, + "compression_loss": 83.87360382080078, + "distillation_loss": 2.9226419925689697, + "epoch": 3.2, + "learning_rate": 3.775241852164929e-05, + "loss": 87.0064, + "step": 3791, + "task_loss": 1.9306540489196777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7855772264757566, + "compression/movement_sparsity/importance_threshold": -0.001501760245045029, + "compression/movement_sparsity/linear_layer_sparsity": 0.7773739300682864, + "compression/movement_sparsity/model_sparsity": 0.7506687440885093, + "compression_loss": 83.90543365478516, + "distillation_loss": 3.404066562652588, + "epoch": 3.21, + "learning_rate": 3.774772236310698e-05, + "loss": 87.303, + "step": 3792, + "task_loss": 1.1482508182525635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7858796562950868, + "compression/movement_sparsity/importance_threshold": -0.001499642106788903, + "compression/movement_sparsity/linear_layer_sparsity": 0.7776549469269608, + "compression/movement_sparsity/model_sparsity": 0.7509401071535793, + "compression_loss": 83.9372787475586, + "distillation_loss": 2.883958101272583, + "epoch": 3.21, + "learning_rate": 3.774302620456467e-05, + "loss": 87.1241, + "step": 3793, + "task_loss": 1.0248199701309204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.786181801608493, + "compression/movement_sparsity/importance_threshold": -0.0014975259611368233, + "compression/movement_sparsity/linear_layer_sparsity": 0.7778897814843815, + "compression/movement_sparsity/model_sparsity": 0.751166874421517, + "compression_loss": 83.96902465820312, + "distillation_loss": 2.95987606048584, + "epoch": 3.21, + "learning_rate": 3.7738330046022356e-05, + "loss": 87.3736, + "step": 3794, + "task_loss": 1.4281424283981323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7864836625498608, + "compression/movement_sparsity/importance_threshold": -0.001495411807151091, + "compression/movement_sparsity/linear_layer_sparsity": 0.7780700868232033, + "compression/movement_sparsity/model_sparsity": 0.7513409857172662, + "compression_loss": 84.00077819824219, + "distillation_loss": 1.937713861465454, + "epoch": 3.21, + "learning_rate": 3.773363388748004e-05, + "loss": 86.772, + "step": 3795, + "task_loss": 1.7503010034561157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7867852392530748, + "compression/movement_sparsity/importance_threshold": -0.0014932996438940145, + "compression/movement_sparsity/linear_layer_sparsity": 0.7783556587139147, + "compression/movement_sparsity/model_sparsity": 0.7516167473350096, + "compression_loss": 84.03253936767578, + "distillation_loss": 2.5877861976623535, + "epoch": 3.21, + "learning_rate": 3.772893772893773e-05, + "loss": 87.2933, + "step": 3796, + "task_loss": 1.5247256755828857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7870865318520205, + "compression/movement_sparsity/importance_threshold": -0.0014911894704278932, + "compression/movement_sparsity/linear_layer_sparsity": 0.7786095600153851, + "compression/movement_sparsity/model_sparsity": 0.7518619263456828, + "compression_loss": 84.06420135498047, + "distillation_loss": 3.9439549446105957, + "epoch": 3.21, + "learning_rate": 3.772424157039542e-05, + "loss": 87.6646, + "step": 3797, + "task_loss": 1.7856820821762085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7873875404805828, + "compression/movement_sparsity/importance_threshold": -0.0014890812858150337, + "compression/movement_sparsity/linear_layer_sparsity": 0.7789716015931452, + "compression/movement_sparsity/model_sparsity": 0.7522115306814766, + "compression_loss": 84.09585571289062, + "distillation_loss": 4.824485778808594, + "epoch": 3.21, + "learning_rate": 3.771954541185311e-05, + "loss": 87.8309, + "step": 3798, + "task_loss": 1.8355602025985718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.787688265272647, + "compression/movement_sparsity/importance_threshold": -0.0014869750891177399, + "compression/movement_sparsity/linear_layer_sparsity": 0.7791246125122485, + "compression/movement_sparsity/model_sparsity": 0.7523592852047922, + "compression_loss": 84.12753295898438, + "distillation_loss": 4.292254447937012, + "epoch": 3.21, + "learning_rate": 3.7714849253310794e-05, + "loss": 88.0214, + "step": 3799, + "task_loss": 1.7683204412460327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7879887063620985, + "compression/movement_sparsity/importance_threshold": -0.001484870879398312, + "compression/movement_sparsity/linear_layer_sparsity": 0.7794336273165319, + "compression/movement_sparsity/model_sparsity": 0.7526576843999077, + "compression_loss": 84.15913391113281, + "distillation_loss": 4.301012992858887, + "epoch": 3.21, + "learning_rate": 3.771015309476848e-05, + "loss": 88.1024, + "step": 3800, + "task_loss": 2.6688618659973145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7882888638828219, + "compression/movement_sparsity/importance_threshold": -0.0014827686557190595, + "compression/movement_sparsity/linear_layer_sparsity": 0.779741998215763, + "compression/movement_sparsity/model_sparsity": 0.7529554618100903, + "compression_loss": 84.19073486328125, + "distillation_loss": 2.6608834266662598, + "epoch": 3.21, + "learning_rate": 3.770545693622617e-05, + "loss": 87.1059, + "step": 3801, + "task_loss": 1.426790475845337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7885887379687029, + "compression/movement_sparsity/importance_threshold": -0.0014806684171422817, + "compression/movement_sparsity/linear_layer_sparsity": 0.7800697101148993, + "compression/movement_sparsity/model_sparsity": 0.7532719157973317, + "compression_loss": 84.22232055664062, + "distillation_loss": 2.390174627304077, + "epoch": 3.21, + "learning_rate": 3.770076077768386e-05, + "loss": 87.7149, + "step": 3802, + "task_loss": 2.257765531539917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.788888328753626, + "compression/movement_sparsity/importance_threshold": -0.001478570162730287, + "compression/movement_sparsity/linear_layer_sparsity": 0.7803571421757618, + "compression/movement_sparsity/model_sparsity": 0.7535494736826591, + "compression_loss": 84.25386810302734, + "distillation_loss": 3.9160256385803223, + "epoch": 3.21, + "learning_rate": 3.7696064619141546e-05, + "loss": 87.5584, + "step": 3803, + "task_loss": 1.6712257862091064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7891876363714769, + "compression/movement_sparsity/importance_threshold": -0.0014764738915453756, + "compression/movement_sparsity/linear_layer_sparsity": 0.7806464821034461, + "compression/movement_sparsity/model_sparsity": 0.7538288738937137, + "compression_loss": 84.28531646728516, + "distillation_loss": 4.057296276092529, + "epoch": 3.22, + "learning_rate": 3.769136846059923e-05, + "loss": 88.2353, + "step": 3804, + "task_loss": 2.9025015830993652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7894866609561405, + "compression/movement_sparsity/importance_threshold": -0.0014743796026498541, + "compression/movement_sparsity/linear_layer_sparsity": 0.7810882725143548, + "compression/movement_sparsity/model_sparsity": 0.7542554874449013, + "compression_loss": 84.31680297851562, + "distillation_loss": 3.3613243103027344, + "epoch": 3.22, + "learning_rate": 3.768667230205692e-05, + "loss": 87.9363, + "step": 3805, + "task_loss": 1.6264674663543701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7897854026415021, + "compression/movement_sparsity/importance_threshold": -0.001472287295106024, + "compression/movement_sparsity/linear_layer_sparsity": 0.7813264696870488, + "compression/movement_sparsity/model_sparsity": 0.7544855018119331, + "compression_loss": 84.34823608398438, + "distillation_loss": 3.5399203300476074, + "epoch": 3.22, + "learning_rate": 3.7681976143514605e-05, + "loss": 87.6794, + "step": 3806, + "task_loss": 1.5807515382766724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7900838615614465, + "compression/movement_sparsity/importance_threshold": -0.0014701969679761914, + "compression/movement_sparsity/linear_layer_sparsity": 0.7816431278827868, + "compression/movement_sparsity/model_sparsity": 0.754791281824493, + "compression_loss": 84.37964630126953, + "distillation_loss": 5.519688606262207, + "epoch": 3.22, + "learning_rate": 3.76772799849723e-05, + "loss": 87.9776, + "step": 3807, + "task_loss": 2.925546407699585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7903820378498592, + "compression/movement_sparsity/importance_threshold": -0.0014681086203226588, + "compression/movement_sparsity/linear_layer_sparsity": 0.7820526872685758, + "compression/movement_sparsity/model_sparsity": 0.755186771585428, + "compression_loss": 84.41105651855469, + "distillation_loss": 2.308300018310547, + "epoch": 3.22, + "learning_rate": 3.767258382642998e-05, + "loss": 88.5638, + "step": 3808, + "task_loss": 1.2355440855026245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.790679931640625, + "compression/movement_sparsity/importance_threshold": -0.0014660222512077326, + "compression/movement_sparsity/linear_layer_sparsity": 0.7822079876278653, + "compression/movement_sparsity/model_sparsity": 0.7553367368996161, + "compression_loss": 84.4424057006836, + "distillation_loss": 4.087986946105957, + "epoch": 3.22, + "learning_rate": 3.766788766788767e-05, + "loss": 88.5734, + "step": 3809, + "task_loss": 2.480222463607788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7909775430676292, + "compression/movement_sparsity/importance_threshold": -0.0014639378596937149, + "compression/movement_sparsity/linear_layer_sparsity": 0.7824988538490069, + "compression/movement_sparsity/model_sparsity": 0.7556176109712524, + "compression_loss": 84.47374725341797, + "distillation_loss": 4.287975311279297, + "epoch": 3.22, + "learning_rate": 3.766319150934536e-05, + "loss": 88.2891, + "step": 3810, + "task_loss": 2.5096845626831055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7912748722647573, + "compression/movement_sparsity/importance_threshold": -0.001461855444842908, + "compression/movement_sparsity/linear_layer_sparsity": 0.7827470792466826, + "compression/movement_sparsity/model_sparsity": 0.7558573090628873, + "compression_loss": 84.50497436523438, + "distillation_loss": 4.653452396392822, + "epoch": 3.22, + "learning_rate": 3.7658495350803044e-05, + "loss": 89.0286, + "step": 3811, + "task_loss": 2.8024399280548096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7915719193658939, + "compression/movement_sparsity/importance_threshold": -0.001459775005717619, + "compression/movement_sparsity/linear_layer_sparsity": 0.7829279569455511, + "compression/movement_sparsity/model_sparsity": 0.7560319730563547, + "compression_loss": 84.5362777709961, + "distillation_loss": 2.995875597000122, + "epoch": 3.22, + "learning_rate": 3.765379919226073e-05, + "loss": 87.8015, + "step": 3812, + "task_loss": 0.972717821598053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7918686845049245, + "compression/movement_sparsity/importance_threshold": -0.0014576965413801496, + "compression/movement_sparsity/linear_layer_sparsity": 0.7831754668931686, + "compression/movement_sparsity/model_sparsity": 0.7562709802758419, + "compression_loss": 84.56742095947266, + "distillation_loss": 4.405223846435547, + "epoch": 3.22, + "learning_rate": 3.7649103033718416e-05, + "loss": 88.9009, + "step": 3813, + "task_loss": 2.338395357131958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7921651678157339, + "compression/movement_sparsity/importance_threshold": -0.001455620050892806, + "compression/movement_sparsity/linear_layer_sparsity": 0.7835576960867362, + "compression/movement_sparsity/model_sparsity": 0.756640078720736, + "compression_loss": 84.5986099243164, + "distillation_loss": 3.6552658081054688, + "epoch": 3.22, + "learning_rate": 3.764440687517611e-05, + "loss": 88.4083, + "step": 3814, + "task_loss": 2.494311571121216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7924613694322076, + "compression/movement_sparsity/importance_threshold": -0.0014535455333178904, + "compression/movement_sparsity/linear_layer_sparsity": 0.7838047529159836, + "compression/movement_sparsity/model_sparsity": 0.7568786483878631, + "compression_loss": 84.62977600097656, + "distillation_loss": 4.410017967224121, + "epoch": 3.22, + "learning_rate": 3.7639710716633796e-05, + "loss": 88.7685, + "step": 3815, + "task_loss": 3.7206950187683105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7927572894882304, + "compression/movement_sparsity/importance_threshold": -0.0014514729877177076, + "compression/movement_sparsity/linear_layer_sparsity": 0.7840937112703035, + "compression/movement_sparsity/model_sparsity": 0.7571576801337722, + "compression_loss": 84.6609115600586, + "distillation_loss": 3.899064540863037, + "epoch": 3.23, + "learning_rate": 3.763501455809148e-05, + "loss": 88.1543, + "step": 3816, + "task_loss": 2.4852349758148193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7930529281176879, + "compression/movement_sparsity/importance_threshold": -0.0014494024131545599, + "compression/movement_sparsity/linear_layer_sparsity": 0.78443196413363, + "compression/movement_sparsity/model_sparsity": 0.7574843129706559, + "compression_loss": 84.69200897216797, + "distillation_loss": 3.362546920776367, + "epoch": 3.23, + "learning_rate": 3.763031839954917e-05, + "loss": 88.3906, + "step": 3817, + "task_loss": 2.102660655975342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7933482854544647, + "compression/movement_sparsity/importance_threshold": -0.0014473338086907553, + "compression/movement_sparsity/linear_layer_sparsity": 0.7845679234930142, + "compression/movement_sparsity/model_sparsity": 0.7576156017077853, + "compression_loss": 84.72306060791016, + "distillation_loss": 3.972957134246826, + "epoch": 3.23, + "learning_rate": 3.7625622241006855e-05, + "loss": 88.4042, + "step": 3818, + "task_loss": 2.2127726078033447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7936433616324464, + "compression/movement_sparsity/importance_threshold": -0.0014452671733885926, + "compression/movement_sparsity/linear_layer_sparsity": 0.784830064394321, + "compression/movement_sparsity/model_sparsity": 0.7578687372626924, + "compression_loss": 84.75406646728516, + "distillation_loss": 4.148760795593262, + "epoch": 3.23, + "learning_rate": 3.762092608246455e-05, + "loss": 88.2685, + "step": 3819, + "task_loss": 2.0772180557250977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7939381567855177, + "compression/movement_sparsity/importance_threshold": -0.00144320250631038, + "compression/movement_sparsity/linear_layer_sparsity": 0.7850356251201956, + "compression/movement_sparsity/model_sparsity": 0.7580672363452545, + "compression_loss": 84.78510284423828, + "distillation_loss": 3.228693962097168, + "epoch": 3.23, + "learning_rate": 3.7616229923922234e-05, + "loss": 87.7965, + "step": 3820, + "task_loss": 3.539278745651245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.794232671047564, + "compression/movement_sparsity/importance_threshold": -0.0014411398065184208, + "compression/movement_sparsity/linear_layer_sparsity": 0.7852586666758244, + "compression/movement_sparsity/model_sparsity": 0.7582826157372915, + "compression_loss": 84.81608581542969, + "distillation_loss": 4.823392391204834, + "epoch": 3.23, + "learning_rate": 3.761153376537992e-05, + "loss": 88.3342, + "step": 3821, + "task_loss": 2.860661506652832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7945269045524707, + "compression/movement_sparsity/importance_threshold": -0.0014390790730750159, + "compression/movement_sparsity/linear_layer_sparsity": 0.7854252472976975, + "compression/movement_sparsity/model_sparsity": 0.7584434738023412, + "compression_loss": 84.84703826904297, + "distillation_loss": 2.765225410461426, + "epoch": 3.23, + "learning_rate": 3.760683760683761e-05, + "loss": 87.8674, + "step": 3822, + "task_loss": 2.1998605728149414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7948208574341225, + "compression/movement_sparsity/importance_threshold": -0.001437020305042472, + "compression/movement_sparsity/linear_layer_sparsity": 0.7856843594604247, + "compression/movement_sparsity/model_sparsity": 0.7586936846651565, + "compression_loss": 84.87796020507812, + "distillation_loss": 4.972770690917969, + "epoch": 3.23, + "learning_rate": 3.76021414482953e-05, + "loss": 88.9424, + "step": 3823, + "task_loss": 2.1536340713500977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7951145298264047, + "compression/movement_sparsity/importance_threshold": -0.001434963501483092, + "compression/movement_sparsity/linear_layer_sparsity": 0.785997309240028, + "compression/movement_sparsity/model_sparsity": 0.7589958836570843, + "compression_loss": 84.90882873535156, + "distillation_loss": 3.387336254119873, + "epoch": 3.23, + "learning_rate": 3.7597445289752987e-05, + "loss": 88.0395, + "step": 3824, + "task_loss": 1.59870183467865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7954079218632024, + "compression/movement_sparsity/importance_threshold": -0.001432908661459181, + "compression/movement_sparsity/linear_layer_sparsity": 0.7862229144916985, + "compression/movement_sparsity/model_sparsity": 0.759213738674317, + "compression_loss": 84.9397201538086, + "distillation_loss": 3.674741744995117, + "epoch": 3.23, + "learning_rate": 3.7592749131210666e-05, + "loss": 87.8829, + "step": 3825, + "task_loss": 1.4453003406524658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.795701033678401, + "compression/movement_sparsity/importance_threshold": -0.0014308557840330416, + "compression/movement_sparsity/linear_layer_sparsity": 0.7864662986773141, + "compression/movement_sparsity/model_sparsity": 0.7594487618644193, + "compression_loss": 84.97051239013672, + "distillation_loss": 2.7658677101135254, + "epoch": 3.23, + "learning_rate": 3.758805297266836e-05, + "loss": 88.4673, + "step": 3826, + "task_loss": 1.799147367477417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.795993865405885, + "compression/movement_sparsity/importance_threshold": -0.0014288048682669805, + "compression/movement_sparsity/linear_layer_sparsity": 0.7868150328839926, + "compression/movement_sparsity/model_sparsity": 0.7597855159782665, + "compression_loss": 85.00128173828125, + "distillation_loss": 3.159055233001709, + "epoch": 3.23, + "learning_rate": 3.7583356814126046e-05, + "loss": 88.4391, + "step": 3827, + "task_loss": 1.5958257913589478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7962864171795404, + "compression/movement_sparsity/importance_threshold": -0.0014267559132232974, + "compression/movement_sparsity/linear_layer_sparsity": 0.7870434403150575, + "compression/movement_sparsity/model_sparsity": 0.760006076911411, + "compression_loss": 85.03201293945312, + "distillation_loss": 5.409295082092285, + "epoch": 3.24, + "learning_rate": 3.757866065558374e-05, + "loss": 88.9511, + "step": 3828, + "task_loss": 3.3699069023132324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7965786891332516, + "compression/movement_sparsity/importance_threshold": -0.0014247089179643003, + "compression/movement_sparsity/linear_layer_sparsity": 0.7873449428937302, + "compression/movement_sparsity/model_sparsity": 0.7602972219489759, + "compression_loss": 85.06273651123047, + "distillation_loss": 3.787811040878296, + "epoch": 3.24, + "learning_rate": 3.757396449704142e-05, + "loss": 89.4122, + "step": 3829, + "task_loss": 1.6658166646957397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7968706814009041, + "compression/movement_sparsity/importance_threshold": -0.0014226638815522906, + "compression/movement_sparsity/linear_layer_sparsity": 0.7876464931690736, + "compression/movement_sparsity/model_sparsity": 0.760588413044684, + "compression_loss": 85.09345245361328, + "distillation_loss": 2.128357410430908, + "epoch": 3.24, + "learning_rate": 3.756926833849911e-05, + "loss": 88.0226, + "step": 3830, + "task_loss": 1.1244258880615234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7971623941163832, + "compression/movement_sparsity/importance_threshold": -0.0014206208030495722, + "compression/movement_sparsity/linear_layer_sparsity": 0.787970997467116, + "compression/movement_sparsity/model_sparsity": 0.7609017696217967, + "compression_loss": 85.12411499023438, + "distillation_loss": 3.292328119277954, + "epoch": 3.24, + "learning_rate": 3.75645721799568e-05, + "loss": 88.6085, + "step": 3831, + "task_loss": 1.8629060983657837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7974538274135735, + "compression/movement_sparsity/importance_threshold": -0.0014185796815184525, + "compression/movement_sparsity/linear_layer_sparsity": 0.7882475546870946, + "compression/movement_sparsity/model_sparsity": 0.7611688262504794, + "compression_loss": 85.15473937988281, + "distillation_loss": 5.397208213806152, + "epoch": 3.24, + "learning_rate": 3.7559876021414484e-05, + "loss": 89.1523, + "step": 3832, + "task_loss": 2.1503803730010986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7977449814263609, + "compression/movement_sparsity/importance_threshold": -0.00141654051602123, + "compression/movement_sparsity/linear_layer_sparsity": 0.7885250062196458, + "compression/movement_sparsity/model_sparsity": 0.7614367464693468, + "compression_loss": 85.1853256225586, + "distillation_loss": 2.5712857246398926, + "epoch": 3.24, + "learning_rate": 3.755517986287217e-05, + "loss": 88.8309, + "step": 3833, + "task_loss": 1.2182261943817139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7980358562886297, + "compression/movement_sparsity/importance_threshold": -0.0014145033056202141, + "compression/movement_sparsity/linear_layer_sparsity": 0.7887556911667293, + "compression/movement_sparsity/model_sparsity": 0.761659506678828, + "compression_loss": 85.21598052978516, + "distillation_loss": 6.438163757324219, + "epoch": 3.24, + "learning_rate": 3.755048370432986e-05, + "loss": 89.3204, + "step": 3834, + "task_loss": 4.295038223266602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7983264521342657, + "compression/movement_sparsity/importance_threshold": -0.001412468049377705, + "compression/movement_sparsity/linear_layer_sparsity": 0.7889657473038026, + "compression/movement_sparsity/model_sparsity": 0.7618623467413846, + "compression_loss": 85.24652862548828, + "distillation_loss": 4.066577434539795, + "epoch": 3.24, + "learning_rate": 3.754578754578755e-05, + "loss": 88.9075, + "step": 3835, + "task_loss": 2.2365853786468506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7986167690971536, + "compression/movement_sparsity/importance_threshold": -0.0014104347463560086, + "compression/movement_sparsity/linear_layer_sparsity": 0.7891614467430423, + "compression/movement_sparsity/model_sparsity": 0.7620513233028446, + "compression_loss": 85.27703857421875, + "distillation_loss": 3.2785258293151855, + "epoch": 3.24, + "learning_rate": 3.7541091387245236e-05, + "loss": 88.6209, + "step": 3836, + "task_loss": 1.4205782413482666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.798906807311179, + "compression/movement_sparsity/importance_threshold": -0.0014084033956174268, + "compression/movement_sparsity/linear_layer_sparsity": 0.7894293827898202, + "compression/movement_sparsity/model_sparsity": 0.762310054922148, + "compression_loss": 85.30764770507812, + "distillation_loss": 4.5068817138671875, + "epoch": 3.24, + "learning_rate": 3.753639522870292e-05, + "loss": 88.8727, + "step": 3837, + "task_loss": 1.9963982105255127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7991965669102266, + "compression/movement_sparsity/importance_threshold": -0.001406373996224267, + "compression/movement_sparsity/linear_layer_sparsity": 0.7897739316136585, + "compression/movement_sparsity/model_sparsity": 0.7626427674339312, + "compression_loss": 85.33812713623047, + "distillation_loss": 4.565608024597168, + "epoch": 3.24, + "learning_rate": 3.753169907016061e-05, + "loss": 89.3987, + "step": 3838, + "task_loss": 2.062997341156006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7994860480281816, + "compression/movement_sparsity/importance_threshold": -0.0014043465472388314, + "compression/movement_sparsity/linear_layer_sparsity": 0.7900248161007171, + "compression/movement_sparsity/model_sparsity": 0.7628850332670484, + "compression_loss": 85.36865997314453, + "distillation_loss": 4.000123500823975, + "epoch": 3.24, + "learning_rate": 3.7527002911618295e-05, + "loss": 88.9871, + "step": 3839, + "task_loss": 1.7384661436080933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.7997752507989294, + "compression/movement_sparsity/importance_threshold": -0.0014023210477234222, + "compression/movement_sparsity/linear_layer_sparsity": 0.7902616777666358, + "compression/movement_sparsity/model_sparsity": 0.7631137580060712, + "compression_loss": 85.39913177490234, + "distillation_loss": 4.044249534606934, + "epoch": 3.25, + "learning_rate": 3.752230675307599e-05, + "loss": 88.7668, + "step": 3840, + "task_loss": 2.224552869796753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8000641753563549, + "compression/movement_sparsity/importance_threshold": -0.0014002974967403468, + "compression/movement_sparsity/linear_layer_sparsity": 0.7905169384232168, + "compression/movement_sparsity/model_sparsity": 0.7633602496738249, + "compression_loss": 85.42953491210938, + "distillation_loss": 6.171478748321533, + "epoch": 3.25, + "learning_rate": 3.7517610594533675e-05, + "loss": 89.5598, + "step": 3841, + "task_loss": 3.6494827270507812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8003528218343435, + "compression/movement_sparsity/importance_threshold": -0.0013982758933519055, + "compression/movement_sparsity/linear_layer_sparsity": 0.7907662727684827, + "compression/movement_sparsity/model_sparsity": 0.7636010186172887, + "compression_loss": 85.45999145507812, + "distillation_loss": 3.000821590423584, + "epoch": 3.25, + "learning_rate": 3.751291443599136e-05, + "loss": 89.1276, + "step": 3842, + "task_loss": 1.9036200046539307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8006411903667799, + "compression/movement_sparsity/importance_threshold": -0.0013962562366204058, + "compression/movement_sparsity/linear_layer_sparsity": 0.7910967987711811, + "compression/movement_sparsity/model_sparsity": 0.7639201900349776, + "compression_loss": 85.49034881591797, + "distillation_loss": 6.182601451873779, + "epoch": 3.25, + "learning_rate": 3.750821827744905e-05, + "loss": 89.7253, + "step": 3843, + "task_loss": 3.2545530796051025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8009292810875499, + "compression/movement_sparsity/importance_threshold": -0.001394238525608148, + "compression/movement_sparsity/linear_layer_sparsity": 0.7913165254082072, + "compression/movement_sparsity/model_sparsity": 0.7641323683860636, + "compression_loss": 85.52069091796875, + "distillation_loss": 2.779271125793457, + "epoch": 3.25, + "learning_rate": 3.7503522118906734e-05, + "loss": 89.4272, + "step": 3844, + "task_loss": 2.387385845184326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8012170941305379, + "compression/movement_sparsity/importance_threshold": -0.0013922227593774414, + "compression/movement_sparsity/linear_layer_sparsity": 0.7914998594856085, + "compression/movement_sparsity/model_sparsity": 0.7643094043739047, + "compression_loss": 85.55097198486328, + "distillation_loss": 3.1984975337982178, + "epoch": 3.25, + "learning_rate": 3.749882596036443e-05, + "loss": 89.1108, + "step": 3845, + "task_loss": 1.8234485387802124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8015046296296298, + "compression/movement_sparsity/importance_threshold": -0.0013902089369905828, + "compression/movement_sparsity/linear_layer_sparsity": 0.7917470236323647, + "compression/movement_sparsity/model_sparsity": 0.764548077671854, + "compression_loss": 85.58130645751953, + "distillation_loss": 3.2152206897735596, + "epoch": 3.25, + "learning_rate": 3.7494129801822106e-05, + "loss": 88.8376, + "step": 3846, + "task_loss": 1.5068246126174927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8017918877187099, + "compression/movement_sparsity/importance_threshold": -0.001388197057509883, + "compression/movement_sparsity/linear_layer_sparsity": 0.7919225950766352, + "compression/movement_sparsity/model_sparsity": 0.7647176176968927, + "compression_loss": 85.61157989501953, + "distillation_loss": 4.448546886444092, + "epoch": 3.25, + "learning_rate": 3.74894336432798e-05, + "loss": 89.3822, + "step": 3847, + "task_loss": 3.531698703765869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8020788685316641, + "compression/movement_sparsity/importance_threshold": -0.0013861871199976408, + "compression/movement_sparsity/linear_layer_sparsity": 0.7921442057321477, + "compression/movement_sparsity/model_sparsity": 0.7649316153446343, + "compression_loss": 85.64181518554688, + "distillation_loss": 3.885610342025757, + "epoch": 3.25, + "learning_rate": 3.7484737484737486e-05, + "loss": 88.9152, + "step": 3848, + "task_loss": 1.8488744497299194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8023655722023769, + "compression/movement_sparsity/importance_threshold": -0.0013841791235161653, + "compression/movement_sparsity/linear_layer_sparsity": 0.7923867433018612, + "compression/movement_sparsity/model_sparsity": 0.7651658210026953, + "compression_loss": 85.67201232910156, + "distillation_loss": 3.7141404151916504, + "epoch": 3.25, + "learning_rate": 3.748004132619517e-05, + "loss": 89.6597, + "step": 3849, + "task_loss": 2.290447473526001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.802651998864734, + "compression/movement_sparsity/importance_threshold": -0.0013821730671277552, + "compression/movement_sparsity/linear_layer_sparsity": 0.7926834881376472, + "compression/movement_sparsity/model_sparsity": 0.7654523717404782, + "compression_loss": 85.70210266113281, + "distillation_loss": 3.9642534255981445, + "epoch": 3.25, + "learning_rate": 3.7475345167652865e-05, + "loss": 88.8215, + "step": 3850, + "task_loss": 1.4042876958847046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8029381486526203, + "compression/movement_sparsity/importance_threshold": -0.0013801689498947178, + "compression/movement_sparsity/linear_layer_sparsity": 0.7929777169740622, + "compression/movement_sparsity/model_sparsity": 0.7657364929112085, + "compression_loss": 85.73225402832031, + "distillation_loss": 4.614724636077881, + "epoch": 3.26, + "learning_rate": 3.7470649009110545e-05, + "loss": 89.2819, + "step": 3851, + "task_loss": 2.7875442504882812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8032240216999207, + "compression/movement_sparsity/importance_threshold": -0.0013781667708793579, + "compression/movement_sparsity/linear_layer_sparsity": 0.7932184301461274, + "compression/movement_sparsity/model_sparsity": 0.7659689368452929, + "compression_loss": 85.76235961914062, + "distillation_loss": 5.234553337097168, + "epoch": 3.26, + "learning_rate": 3.746595285056824e-05, + "loss": 89.7429, + "step": 3852, + "task_loss": 2.6951162815093994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8035096181405209, + "compression/movement_sparsity/importance_threshold": -0.0013761665291439741, + "compression/movement_sparsity/linear_layer_sparsity": 0.7933027220871453, + "compression/movement_sparsity/model_sparsity": 0.7660503330988244, + "compression_loss": 85.7923812866211, + "distillation_loss": 3.2017905712127686, + "epoch": 3.26, + "learning_rate": 3.7461256692025924e-05, + "loss": 89.4474, + "step": 3853, + "task_loss": 1.3360157012939453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8037949381083055, + "compression/movement_sparsity/importance_threshold": -0.0013741682237508766, + "compression/movement_sparsity/linear_layer_sparsity": 0.7935495881297105, + "compression/movement_sparsity/model_sparsity": 0.7662887185333789, + "compression_loss": 85.82234954833984, + "distillation_loss": 3.7438907623291016, + "epoch": 3.26, + "learning_rate": 3.745656053348362e-05, + "loss": 90.5502, + "step": 3854, + "task_loss": 1.6186754703521729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.80407998173716, + "compression/movement_sparsity/importance_threshold": -0.0013721718537623656, + "compression/movement_sparsity/linear_layer_sparsity": 0.7936648471340788, + "compression/movement_sparsity/model_sparsity": 0.7664000180363689, + "compression_loss": 85.85240173339844, + "distillation_loss": 3.18158221244812, + "epoch": 3.26, + "learning_rate": 3.74518643749413e-05, + "loss": 89.1389, + "step": 3855, + "task_loss": 2.566549777984619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8043647491609693, + "compression/movement_sparsity/importance_threshold": -0.001370177418240747, + "compression/movement_sparsity/linear_layer_sparsity": 0.7939893395079535, + "compression/movement_sparsity/model_sparsity": 0.7667133630989457, + "compression_loss": 85.88236999511719, + "distillation_loss": 2.4177823066711426, + "epoch": 3.26, + "learning_rate": 3.744716821639899e-05, + "loss": 88.9918, + "step": 3856, + "task_loss": 2.462205171585083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8046492405136187, + "compression/movement_sparsity/importance_threshold": -0.0013681849162483243, + "compression/movement_sparsity/linear_layer_sparsity": 0.7941820102086138, + "compression/movement_sparsity/model_sparsity": 0.7668994149683139, + "compression_loss": 85.91234588623047, + "distillation_loss": 4.64054012298584, + "epoch": 3.26, + "learning_rate": 3.7442472057856676e-05, + "loss": 89.561, + "step": 3857, + "task_loss": 3.1398696899414062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8049334559289932, + "compression/movement_sparsity/importance_threshold": -0.0013661943468474009, + "compression/movement_sparsity/linear_layer_sparsity": 0.794403465849947, + "compression/movement_sparsity/model_sparsity": 0.7671132629270901, + "compression_loss": 85.94224548339844, + "distillation_loss": 2.4275131225585938, + "epoch": 3.26, + "learning_rate": 3.743777589931436e-05, + "loss": 89.0713, + "step": 3858, + "task_loss": 1.8161195516586304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8052173955409783, + "compression/movement_sparsity/importance_threshold": -0.0013642057091002805, + "compression/movement_sparsity/linear_layer_sparsity": 0.7946538733703001, + "compression/movement_sparsity/model_sparsity": 0.7673550681787754, + "compression_loss": 85.97218322753906, + "distillation_loss": 5.4072184562683105, + "epoch": 3.26, + "learning_rate": 3.743307974077205e-05, + "loss": 89.9296, + "step": 3859, + "task_loss": 2.4943315982818604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8055010594834585, + "compression/movement_sparsity/importance_threshold": -0.0013622190020692688, + "compression/movement_sparsity/linear_layer_sparsity": 0.7949197226877417, + "compression/movement_sparsity/model_sparsity": 0.7676117847543148, + "compression_loss": 86.00200653076172, + "distillation_loss": 4.195233345031738, + "epoch": 3.26, + "learning_rate": 3.7428383582229735e-05, + "loss": 90.225, + "step": 3860, + "task_loss": 2.5529863834381104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8057844478903194, + "compression/movement_sparsity/importance_threshold": -0.0013602342248166688, + "compression/movement_sparsity/linear_layer_sparsity": 0.7952802737445474, + "compression/movement_sparsity/model_sparsity": 0.7679599497731343, + "compression_loss": 86.03182220458984, + "distillation_loss": 3.866274356842041, + "epoch": 3.26, + "learning_rate": 3.742368742368743e-05, + "loss": 89.8705, + "step": 3861, + "task_loss": 2.6183953285217285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8060675608954462, + "compression/movement_sparsity/importance_threshold": -0.0013582513764047836, + "compression/movement_sparsity/linear_layer_sparsity": 0.795507119109652, + "compression/movement_sparsity/model_sparsity": 0.7681790023020896, + "compression_loss": 86.06163787841797, + "distillation_loss": 3.0795345306396484, + "epoch": 3.26, + "learning_rate": 3.7418991265145115e-05, + "loss": 88.9724, + "step": 3862, + "task_loss": 1.9453215599060059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8063503986327237, + "compression/movement_sparsity/importance_threshold": -0.0013562704558959188, + "compression/movement_sparsity/linear_layer_sparsity": 0.7956969280100796, + "compression/movement_sparsity/model_sparsity": 0.7683622906828671, + "compression_loss": 86.0913314819336, + "distillation_loss": 2.7485480308532715, + "epoch": 3.27, + "learning_rate": 3.74142951066028e-05, + "loss": 90.1207, + "step": 3863, + "task_loss": 1.1412886381149292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8066329612360374, + "compression/movement_sparsity/importance_threshold": -0.0013542914623523757, + "compression/movement_sparsity/linear_layer_sparsity": 0.7959748803576716, + "compression/movement_sparsity/model_sparsity": 0.7686306945122379, + "compression_loss": 86.12100219726562, + "distillation_loss": 3.6474804878234863, + "epoch": 3.27, + "learning_rate": 3.740959894806049e-05, + "loss": 89.8424, + "step": 3864, + "task_loss": 1.7248187065124512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8069152488392721, + "compression/movement_sparsity/importance_threshold": -0.0013523143948364624, + "compression/movement_sparsity/linear_layer_sparsity": 0.7962524869044021, + "compression/movement_sparsity/model_sparsity": 0.7688987644200705, + "compression_loss": 86.15070343017578, + "distillation_loss": 3.387557029724121, + "epoch": 3.27, + "learning_rate": 3.7404902789518174e-05, + "loss": 89.7969, + "step": 3865, + "task_loss": 1.395418643951416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8071972615763134, + "compression/movement_sparsity/importance_threshold": -0.0013503392524104778, + "compression/movement_sparsity/linear_layer_sparsity": 0.7964429754823851, + "compression/movement_sparsity/model_sparsity": 0.7690827091293884, + "compression_loss": 86.18028259277344, + "distillation_loss": 3.523892879486084, + "epoch": 3.27, + "learning_rate": 3.740020663097587e-05, + "loss": 89.5473, + "step": 3866, + "task_loss": 2.007884979248047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8074789995810459, + "compression/movement_sparsity/importance_threshold": -0.0013483660341367318, + "compression/movement_sparsity/linear_layer_sparsity": 0.7966732550077689, + "compression/movement_sparsity/model_sparsity": 0.7693050778446525, + "compression_loss": 86.20990753173828, + "distillation_loss": 3.096726894378662, + "epoch": 3.27, + "learning_rate": 3.739551047243355e-05, + "loss": 89.5338, + "step": 3867, + "task_loss": 1.8810862302780151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8077604629873552, + "compression/movement_sparsity/importance_threshold": -0.0013463947390775222, + "compression/movement_sparsity/linear_layer_sparsity": 0.7968820948797434, + "compression/movement_sparsity/model_sparsity": 0.7695067434245582, + "compression_loss": 86.239501953125, + "distillation_loss": 4.6692609786987305, + "epoch": 3.27, + "learning_rate": 3.739081431389124e-05, + "loss": 89.6781, + "step": 3868, + "task_loss": 1.598134160041809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8080416519291259, + "compression/movement_sparsity/importance_threshold": -0.001344425366295158, + "compression/movement_sparsity/linear_layer_sparsity": 0.7971148307836601, + "compression/movement_sparsity/model_sparsity": 0.769731484134196, + "compression_loss": 86.26899719238281, + "distillation_loss": 5.1784796714782715, + "epoch": 3.27, + "learning_rate": 3.7386118155348926e-05, + "loss": 90.0477, + "step": 3869, + "task_loss": 2.346848964691162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8083225665402436, + "compression/movement_sparsity/importance_threshold": -0.0013424579148519416, + "compression/movement_sparsity/linear_layer_sparsity": 0.7973260077924913, + "compression/movement_sparsity/model_sparsity": 0.7699354065631173, + "compression_loss": 86.29850769042969, + "distillation_loss": 6.489777565002441, + "epoch": 3.27, + "learning_rate": 3.738142199680661e-05, + "loss": 90.9212, + "step": 3870, + "task_loss": 2.9595510959625244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8086032069545934, + "compression/movement_sparsity/importance_threshold": -0.001340492383810175, + "compression/movement_sparsity/linear_layer_sparsity": 0.7976084317029467, + "compression/movement_sparsity/model_sparsity": 0.770208128343411, + "compression_loss": 86.32801818847656, + "distillation_loss": 3.1317710876464844, + "epoch": 3.27, + "learning_rate": 3.7376725838264305e-05, + "loss": 90.1283, + "step": 3871, + "task_loss": 2.0601143836975098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8088835733060602, + "compression/movement_sparsity/importance_threshold": -0.0013385287722321657, + "compression/movement_sparsity/linear_layer_sparsity": 0.7978264293356655, + "compression/movement_sparsity/model_sparsity": 0.7704186370868068, + "compression_loss": 86.35747528076172, + "distillation_loss": 5.940177917480469, + "epoch": 3.27, + "learning_rate": 3.7372029679721985e-05, + "loss": 90.7524, + "step": 3872, + "task_loss": 3.879319667816162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8091636657285296, + "compression/movement_sparsity/importance_threshold": -0.001336567079180213, + "compression/movement_sparsity/linear_layer_sparsity": 0.7980464779252178, + "compression/movement_sparsity/model_sparsity": 0.7706311263303592, + "compression_loss": 86.38692474365234, + "distillation_loss": 4.192668914794922, + "epoch": 3.27, + "learning_rate": 3.736733352117968e-05, + "loss": 90.1243, + "step": 3873, + "task_loss": 2.8153023719787598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8094434843558862, + "compression/movement_sparsity/importance_threshold": -0.0013346073037166254, + "compression/movement_sparsity/linear_layer_sparsity": 0.7982896355516482, + "compression/movement_sparsity/model_sparsity": 0.7708659307442816, + "compression_loss": 86.41630554199219, + "distillation_loss": 4.960709571838379, + "epoch": 3.27, + "learning_rate": 3.7362637362637365e-05, + "loss": 90.1977, + "step": 3874, + "task_loss": 2.4870309829711914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8097230293220155, + "compression/movement_sparsity/importance_threshold": -0.001332649444903705, + "compression/movement_sparsity/linear_layer_sparsity": 0.7985713559362131, + "compression/movement_sparsity/model_sparsity": 0.7711379731669633, + "compression_loss": 86.44563293457031, + "distillation_loss": 3.0128109455108643, + "epoch": 3.28, + "learning_rate": 3.735794120409505e-05, + "loss": 89.6698, + "step": 3875, + "task_loss": 1.6253968477249146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8100023007608024, + "compression/movement_sparsity/importance_threshold": -0.0013306935018037556, + "compression/movement_sparsity/linear_layer_sparsity": 0.7987788126047419, + "compression/movement_sparsity/model_sparsity": 0.7713383030607167, + "compression_loss": 86.47492980957031, + "distillation_loss": 4.748160362243652, + "epoch": 3.28, + "learning_rate": 3.735324504555274e-05, + "loss": 89.7852, + "step": 3876, + "task_loss": 3.0472424030303955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8102812988061322, + "compression/movement_sparsity/importance_threshold": -0.0013287394734790813, + "compression/movement_sparsity/linear_layer_sparsity": 0.7989338506323433, + "compression/movement_sparsity/model_sparsity": 0.7714880150551173, + "compression_loss": 86.50424194335938, + "distillation_loss": 4.278330326080322, + "epoch": 3.28, + "learning_rate": 3.7348548887010424e-05, + "loss": 90.8193, + "step": 3877, + "task_loss": 2.4647650718688965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8105600235918899, + "compression/movement_sparsity/importance_threshold": -0.0013267873589919866, + "compression/movement_sparsity/linear_layer_sparsity": 0.7992155590927407, + "compression/movement_sparsity/model_sparsity": 0.7717600459632634, + "compression_loss": 86.53349304199219, + "distillation_loss": 3.979754686355591, + "epoch": 3.28, + "learning_rate": 3.734385272846812e-05, + "loss": 90.3113, + "step": 3878, + "task_loss": 2.9081761837005615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8108384752519607, + "compression/movement_sparsity/importance_threshold": -0.0013248371574047756, + "compression/movement_sparsity/linear_layer_sparsity": 0.7994524565311624, + "compression/movement_sparsity/model_sparsity": 0.7719888052458935, + "compression_loss": 86.56272888183594, + "distillation_loss": 4.393070697784424, + "epoch": 3.28, + "learning_rate": 3.73391565699258e-05, + "loss": 90.5074, + "step": 3879, + "task_loss": 1.6161513328552246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.81111665392023, + "compression/movement_sparsity/importance_threshold": -0.0013228888677797503, + "compression/movement_sparsity/linear_layer_sparsity": 0.7996014370816048, + "compression/movement_sparsity/model_sparsity": 0.7721326678561106, + "compression_loss": 86.5919189453125, + "distillation_loss": 3.9516634941101074, + "epoch": 3.28, + "learning_rate": 3.7334460411383496e-05, + "loss": 90.3049, + "step": 3880, + "task_loss": 2.8951797485351562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8113945597305825, + "compression/movement_sparsity/importance_threshold": -0.0013209424891792183, + "compression/movement_sparsity/linear_layer_sparsity": 0.7998888929908026, + "compression/movement_sparsity/model_sparsity": 0.7724102487705096, + "compression_loss": 86.62108612060547, + "distillation_loss": 2.451702117919922, + "epoch": 3.28, + "learning_rate": 3.7329764252841176e-05, + "loss": 89.7498, + "step": 3881, + "task_loss": 0.7217500805854797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8116721928169038, + "compression/movement_sparsity/importance_threshold": -0.001318998020665479, + "compression/movement_sparsity/linear_layer_sparsity": 0.8001335649865229, + "compression/movement_sparsity/model_sparsity": 0.7726465155304777, + "compression_loss": 86.65023040771484, + "distillation_loss": 3.5232186317443848, + "epoch": 3.28, + "learning_rate": 3.732506809429886e-05, + "loss": 90.262, + "step": 3882, + "task_loss": 1.8981250524520874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8119495533130786, + "compression/movement_sparsity/importance_threshold": -0.0013170554613008405, + "compression/movement_sparsity/linear_layer_sparsity": 0.8004067715153957, + "compression/movement_sparsity/model_sparsity": 0.7729103365746023, + "compression_loss": 86.6793441772461, + "distillation_loss": 4.564419746398926, + "epoch": 3.28, + "learning_rate": 3.7320371935756555e-05, + "loss": 91.0749, + "step": 3883, + "task_loss": 3.2512624263763428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8122266413529925, + "compression/movement_sparsity/importance_threshold": -0.0013151148101476036, + "compression/movement_sparsity/linear_layer_sparsity": 0.8005987267659979, + "compression/movement_sparsity/model_sparsity": 0.7730956975718227, + "compression_loss": 86.70841979980469, + "distillation_loss": 4.30769157409668, + "epoch": 3.28, + "learning_rate": 3.731567577721424e-05, + "loss": 90.0934, + "step": 3884, + "task_loss": 1.9805569648742676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8125034570705301, + "compression/movement_sparsity/importance_threshold": -0.0013131760662680762, + "compression/movement_sparsity/linear_layer_sparsity": 0.8008830943157779, + "compression/movement_sparsity/model_sparsity": 0.7733702962214509, + "compression_loss": 86.73739624023438, + "distillation_loss": 3.534207582473755, + "epoch": 3.28, + "learning_rate": 3.731097961867193e-05, + "loss": 90.5623, + "step": 3885, + "task_loss": 1.7491908073425293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.812780000599577, + "compression/movement_sparsity/importance_threshold": -0.0013112392287245572, + "compression/movement_sparsity/linear_layer_sparsity": 0.8010370830166275, + "compression/movement_sparsity/model_sparsity": 0.7735189949367016, + "compression_loss": 86.76636505126953, + "distillation_loss": 3.468538761138916, + "epoch": 3.28, + "learning_rate": 3.7306283460129614e-05, + "loss": 90.3915, + "step": 3886, + "task_loss": 2.10611891746521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.813056272074018, + "compression/movement_sparsity/importance_threshold": -0.0013093042965793555, + "compression/movement_sparsity/linear_layer_sparsity": 0.8011666867946616, + "compression/movement_sparsity/model_sparsity": 0.7736441464262526, + "compression_loss": 86.79533386230469, + "distillation_loss": 3.7590949535369873, + "epoch": 3.29, + "learning_rate": 3.730158730158731e-05, + "loss": 90.5079, + "step": 3887, + "task_loss": 1.6393036842346191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8133322716277386, + "compression/movement_sparsity/importance_threshold": -0.001307371268894771, + "compression/movement_sparsity/linear_layer_sparsity": 0.8014187756226514, + "compression/movement_sparsity/model_sparsity": 0.7738875752274849, + "compression_loss": 86.82425689697266, + "distillation_loss": 3.8149752616882324, + "epoch": 3.29, + "learning_rate": 3.7296891143044994e-05, + "loss": 90.3452, + "step": 3888, + "task_loss": 2.3185791969299316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8136079993946235, + "compression/movement_sparsity/importance_threshold": -0.0013054401447331106, + "compression/movement_sparsity/linear_layer_sparsity": 0.8016001183640576, + "compression/movement_sparsity/model_sparsity": 0.7740626882878483, + "compression_loss": 86.8531265258789, + "distillation_loss": 3.706239700317383, + "epoch": 3.29, + "learning_rate": 3.729219498450267e-05, + "loss": 90.5185, + "step": 3889, + "task_loss": 2.384315252304077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8138834555085583, + "compression/movement_sparsity/importance_threshold": -0.001303510923156676, + "compression/movement_sparsity/linear_layer_sparsity": 0.801704896025074, + "compression/movement_sparsity/model_sparsity": 0.7741638665138749, + "compression_loss": 86.8820571899414, + "distillation_loss": 3.500173807144165, + "epoch": 3.29, + "learning_rate": 3.7287498825960366e-05, + "loss": 90.714, + "step": 3890, + "task_loss": 2.0333919525146484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8141586401034278, + "compression/movement_sparsity/importance_threshold": -0.0013015836032277735, + "compression/movement_sparsity/linear_layer_sparsity": 0.8020942081742173, + "compression/movement_sparsity/model_sparsity": 0.7745398045930308, + "compression_loss": 86.91093444824219, + "distillation_loss": 4.866014003753662, + "epoch": 3.29, + "learning_rate": 3.728280266741805e-05, + "loss": 90.7699, + "step": 3891, + "task_loss": 2.6550991535186768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8144335533131172, + "compression/movement_sparsity/importance_threshold": -0.0012996581840087063, + "compression/movement_sparsity/linear_layer_sparsity": 0.8023458677321721, + "compression/movement_sparsity/model_sparsity": 0.7747828188709747, + "compression_loss": 86.93978118896484, + "distillation_loss": 3.7373569011688232, + "epoch": 3.29, + "learning_rate": 3.7278106508875746e-05, + "loss": 90.8806, + "step": 3892, + "task_loss": 2.115870714187622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8147081952715118, + "compression/movement_sparsity/importance_threshold": -0.0012977346645617772, + "compression/movement_sparsity/linear_layer_sparsity": 0.8025454663742289, + "compression/movement_sparsity/model_sparsity": 0.7749755606856394, + "compression_loss": 86.96858215332031, + "distillation_loss": 4.105489730834961, + "epoch": 3.29, + "learning_rate": 3.7273410350333425e-05, + "loss": 90.4358, + "step": 3893, + "task_loss": 2.2087836265563965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8149825661124966, + "compression/movement_sparsity/importance_threshold": -0.001295813043949291, + "compression/movement_sparsity/linear_layer_sparsity": 0.8027230172303268, + "compression/movement_sparsity/model_sparsity": 0.7751470121236202, + "compression_loss": 86.99742126464844, + "distillation_loss": 5.379485130310059, + "epoch": 3.29, + "learning_rate": 3.726871419179112e-05, + "loss": 90.8174, + "step": 3894, + "task_loss": 2.113665819168091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.815256665969957, + "compression/movement_sparsity/importance_threshold": -0.001293893321233551, + "compression/movement_sparsity/linear_layer_sparsity": 0.8029855516291656, + "compression/movement_sparsity/model_sparsity": 0.7754005276582085, + "compression_loss": 87.02616882324219, + "distillation_loss": 2.650883197784424, + "epoch": 3.29, + "learning_rate": 3.7264018033248805e-05, + "loss": 90.7836, + "step": 3895, + "task_loss": 0.9732071161270142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8155304949777777, + "compression/movement_sparsity/importance_threshold": -0.0012919754954768634, + "compression/movement_sparsity/linear_layer_sparsity": 0.8030973049282489, + "compression/movement_sparsity/model_sparsity": 0.775508441887675, + "compression_loss": 87.05491638183594, + "distillation_loss": 2.632117986679077, + "epoch": 3.29, + "learning_rate": 3.725932187470649e-05, + "loss": 90.4978, + "step": 3896, + "task_loss": 1.5082505941390991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8158040532698443, + "compression/movement_sparsity/importance_threshold": -0.0012900595657415288, + "compression/movement_sparsity/linear_layer_sparsity": 0.8033497514812677, + "compression/movement_sparsity/model_sparsity": 0.7757522161249812, + "compression_loss": 87.0836410522461, + "distillation_loss": 3.9015860557556152, + "epoch": 3.29, + "learning_rate": 3.7254625716164184e-05, + "loss": 90.3562, + "step": 3897, + "task_loss": 3.559332847595215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8160773409800414, + "compression/movement_sparsity/importance_threshold": -0.0012881455310898553, + "compression/movement_sparsity/linear_layer_sparsity": 0.8036510751974261, + "compression/movement_sparsity/model_sparsity": 0.7760431884445093, + "compression_loss": 87.11227416992188, + "distillation_loss": 4.59242582321167, + "epoch": 3.29, + "learning_rate": 3.7249929557621864e-05, + "loss": 91.299, + "step": 3898, + "task_loss": 1.2761034965515137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8163503582422547, + "compression/movement_sparsity/importance_threshold": -0.0012862333905841426, + "compression/movement_sparsity/linear_layer_sparsity": 0.8038668907074675, + "compression/movement_sparsity/model_sparsity": 0.7762515900278547, + "compression_loss": 87.14088439941406, + "distillation_loss": 3.761213779449463, + "epoch": 3.3, + "learning_rate": 3.724523339907956e-05, + "loss": 90.4576, + "step": 3899, + "task_loss": 1.852061152458191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8166231051903688, + "compression/movement_sparsity/importance_threshold": -0.001284323143286698, + "compression/movement_sparsity/linear_layer_sparsity": 0.8041046347617914, + "compression/movement_sparsity/model_sparsity": 0.7764811668425262, + "compression_loss": 87.1695327758789, + "distillation_loss": 3.510695457458496, + "epoch": 3.3, + "learning_rate": 3.724053724053724e-05, + "loss": 90.9901, + "step": 3900, + "task_loss": 3.1880505084991455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8168955819582693, + "compression/movement_sparsity/importance_threshold": -0.0012824147882598246, + "compression/movement_sparsity/linear_layer_sparsity": 0.8040968482803251, + "compression/movement_sparsity/model_sparsity": 0.7764736478506524, + "compression_loss": 87.19807434082031, + "distillation_loss": 3.4244914054870605, + "epoch": 3.3, + "learning_rate": 3.723584108199493e-05, + "loss": 90.5658, + "step": 3901, + "task_loss": 2.0118608474731445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8171677886798413, + "compression/movement_sparsity/importance_threshold": -0.0012805083245658235, + "compression/movement_sparsity/linear_layer_sparsity": 0.8043192459309016, + "compression/movement_sparsity/model_sparsity": 0.7766884054577564, + "compression_loss": 87.2265853881836, + "distillation_loss": 4.648602485656738, + "epoch": 3.3, + "learning_rate": 3.7231144923452616e-05, + "loss": 91.4149, + "step": 3902, + "task_loss": 3.4610774517059326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8174397254889696, + "compression/movement_sparsity/importance_threshold": -0.0012786037512670032, + "compression/movement_sparsity/linear_layer_sparsity": 0.8045043567092808, + "compression/movement_sparsity/model_sparsity": 0.7768671571114308, + "compression_loss": 87.25508117675781, + "distillation_loss": 3.6228928565979004, + "epoch": 3.3, + "learning_rate": 3.72264487649103e-05, + "loss": 91.1295, + "step": 3903, + "task_loss": 1.9063886404037476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8177113925195397, + "compression/movement_sparsity/importance_threshold": -0.0012767010674256648, + "compression/movement_sparsity/linear_layer_sparsity": 0.8047339923296122, + "compression/movement_sparsity/model_sparsity": 0.7770889040417621, + "compression_loss": 87.28352355957031, + "distillation_loss": 3.057570457458496, + "epoch": 3.3, + "learning_rate": 3.7221752606367995e-05, + "loss": 90.6942, + "step": 3904, + "task_loss": 2.9712564945220947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8179827899054365, + "compression/movement_sparsity/importance_threshold": -0.001274800272104113, + "compression/movement_sparsity/linear_layer_sparsity": 0.8049187692312976, + "compression/movement_sparsity/model_sparsity": 0.7772673332884343, + "compression_loss": 87.31199645996094, + "distillation_loss": 3.0461325645446777, + "epoch": 3.3, + "learning_rate": 3.721705644782568e-05, + "loss": 90.8988, + "step": 3905, + "task_loss": 1.039448618888855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8182539177805455, + "compression/movement_sparsity/importance_threshold": -0.0012729013643646503, + "compression/movement_sparsity/linear_layer_sparsity": 0.8051465685298131, + "compression/movement_sparsity/model_sparsity": 0.7774873069802533, + "compression_loss": 87.34043884277344, + "distillation_loss": 4.027436256408691, + "epoch": 3.3, + "learning_rate": 3.721236028928337e-05, + "loss": 91.2984, + "step": 3906, + "task_loss": 1.8307119607925415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8185247762787512, + "compression/movement_sparsity/importance_threshold": -0.0012710043432695847, + "compression/movement_sparsity/linear_layer_sparsity": 0.8053643873000174, + "compression/movement_sparsity/model_sparsity": 0.7776976430056121, + "compression_loss": 87.36880493164062, + "distillation_loss": 3.299128770828247, + "epoch": 3.3, + "learning_rate": 3.7207664130741054e-05, + "loss": 91.0946, + "step": 3907, + "task_loss": 1.9460349082946777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8187953655339393, + "compression/movement_sparsity/importance_threshold": -0.0012691092078812175, + "compression/movement_sparsity/linear_layer_sparsity": 0.8057155898093965, + "compression/movement_sparsity/model_sparsity": 0.7780367806283687, + "compression_loss": 87.39716339111328, + "distillation_loss": 3.5561137199401855, + "epoch": 3.3, + "learning_rate": 3.720296797219874e-05, + "loss": 91.1712, + "step": 3908, + "task_loss": 1.7611490488052368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8190656856799947, + "compression/movement_sparsity/importance_threshold": -0.0012672159572618517, + "compression/movement_sparsity/linear_layer_sparsity": 0.8059746304271179, + "compression/movement_sparsity/model_sparsity": 0.7782869224039694, + "compression_loss": 87.42545318603516, + "distillation_loss": 3.0332374572753906, + "epoch": 3.3, + "learning_rate": 3.7198271813656434e-05, + "loss": 91.0947, + "step": 3909, + "task_loss": 1.8922971487045288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8193357368508025, + "compression/movement_sparsity/importance_threshold": -0.001265324590473794, + "compression/movement_sparsity/linear_layer_sparsity": 0.8061248153184917, + "compression/movement_sparsity/model_sparsity": 0.7784319479823015, + "compression_loss": 87.45381927490234, + "distillation_loss": 4.923306941986084, + "epoch": 3.3, + "learning_rate": 3.7193575655114113e-05, + "loss": 91.757, + "step": 3910, + "task_loss": 2.933204174041748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8196055191802482, + "compression/movement_sparsity/importance_threshold": -0.0012634351065793446, + "compression/movement_sparsity/linear_layer_sparsity": 0.8063399630751455, + "compression/movement_sparsity/model_sparsity": 0.7786397047516425, + "compression_loss": 87.48209381103516, + "distillation_loss": 4.682652473449707, + "epoch": 3.31, + "learning_rate": 3.7188879496571807e-05, + "loss": 91.762, + "step": 3911, + "task_loss": 1.5890557765960693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8198750328022164, + "compression/movement_sparsity/importance_threshold": -0.0012615475046408118, + "compression/movement_sparsity/linear_layer_sparsity": 0.8066580163984969, + "compression/movement_sparsity/model_sparsity": 0.7789468319648902, + "compression_loss": 87.51036071777344, + "distillation_loss": 2.591740608215332, + "epoch": 3.31, + "learning_rate": 3.718418333802949e-05, + "loss": 90.7678, + "step": 3912, + "task_loss": 1.3380464315414429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8201442778505927, + "compression/movement_sparsity/importance_threshold": -0.001259661783720496, + "compression/movement_sparsity/linear_layer_sparsity": 0.8068641971810886, + "compression/movement_sparsity/model_sparsity": 0.7791459298033137, + "compression_loss": 87.5385513305664, + "distillation_loss": 3.4777636528015137, + "epoch": 3.31, + "learning_rate": 3.717948717948718e-05, + "loss": 90.8966, + "step": 3913, + "task_loss": 1.8086028099060059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8204132544592619, + "compression/movement_sparsity/importance_threshold": -0.001257777942880704, + "compression/movement_sparsity/linear_layer_sparsity": 0.8071437235188085, + "compression/movement_sparsity/model_sparsity": 0.7794158535514093, + "compression_loss": 87.56670379638672, + "distillation_loss": 1.9852522611618042, + "epoch": 3.31, + "learning_rate": 3.717479102094487e-05, + "loss": 91.1431, + "step": 3914, + "task_loss": 1.2358969449996948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8206819627621096, + "compression/movement_sparsity/importance_threshold": -0.0012558959811837366, + "compression/movement_sparsity/linear_layer_sparsity": 0.8073720355565324, + "compression/movement_sparsity/model_sparsity": 0.7796363223682673, + "compression_loss": 87.59481811523438, + "distillation_loss": 4.498531818389893, + "epoch": 3.31, + "learning_rate": 3.717009486240255e-05, + "loss": 91.622, + "step": 3915, + "task_loss": 2.271231174468994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8209504028930203, + "compression/movement_sparsity/importance_threshold": -0.0012540158976919015, + "compression/movement_sparsity/linear_layer_sparsity": 0.8076166360072469, + "compression/movement_sparsity/model_sparsity": 0.7798725200410208, + "compression_loss": 87.62295532226562, + "distillation_loss": 4.416722297668457, + "epoch": 3.31, + "learning_rate": 3.7165398703860245e-05, + "loss": 91.6264, + "step": 3916, + "task_loss": 3.185558795928955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8212185749858798, + "compression/movement_sparsity/importance_threshold": -0.001252137691467499, + "compression/movement_sparsity/linear_layer_sparsity": 0.8078009478663943, + "compression/movement_sparsity/model_sparsity": 0.780050500220797, + "compression_loss": 87.65107727050781, + "distillation_loss": 3.485424518585205, + "epoch": 3.31, + "learning_rate": 3.716070254531793e-05, + "loss": 91.2902, + "step": 3917, + "task_loss": 1.4358606338500977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8214864791745726, + "compression/movement_sparsity/importance_threshold": -0.0012502613615728367, + "compression/movement_sparsity/linear_layer_sparsity": 0.8079720238994661, + "compression/movement_sparsity/model_sparsity": 0.7802156992658412, + "compression_loss": 87.67911529541016, + "distillation_loss": 3.9715631008148193, + "epoch": 3.31, + "learning_rate": 3.7156006386775624e-05, + "loss": 90.8783, + "step": 3918, + "task_loss": 1.6224737167358398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8217541155929845, + "compression/movement_sparsity/importance_threshold": -0.0012483869070702147, + "compression/movement_sparsity/linear_layer_sparsity": 0.8083338866147116, + "compression/movement_sparsity/model_sparsity": 0.7805651308835981, + "compression_loss": 87.70706939697266, + "distillation_loss": 5.6892499923706055, + "epoch": 3.31, + "learning_rate": 3.7151310228233304e-05, + "loss": 92.1404, + "step": 3919, + "task_loss": 3.364763021469116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.822021484375, + "compression/movement_sparsity/importance_threshold": -0.0012465143270219414, + "compression/movement_sparsity/linear_layer_sparsity": 0.8085758637485462, + "compression/movement_sparsity/model_sparsity": 0.7807987953584768, + "compression_loss": 87.73517608642578, + "distillation_loss": 2.941068649291992, + "epoch": 3.31, + "learning_rate": 3.714661406969099e-05, + "loss": 91.6932, + "step": 3920, + "task_loss": 1.8125264644622803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8222885856545046, + "compression/movement_sparsity/importance_threshold": -0.0012446436204903181, + "compression/movement_sparsity/linear_layer_sparsity": 0.8088810866675213, + "compression/movement_sparsity/model_sparsity": 0.7810935329312096, + "compression_loss": 87.76311492919922, + "distillation_loss": 2.9086735248565674, + "epoch": 3.31, + "learning_rate": 3.7141917911148683e-05, + "loss": 91.5697, + "step": 3921, + "task_loss": 1.0722906589508057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8225554195653835, + "compression/movement_sparsity/importance_threshold": -0.0012427747865376478, + "compression/movement_sparsity/linear_layer_sparsity": 0.8091712612869402, + "compression/movement_sparsity/model_sparsity": 0.7813737391597698, + "compression_loss": 87.79106903076172, + "distillation_loss": 3.697805881500244, + "epoch": 3.32, + "learning_rate": 3.713722175260637e-05, + "loss": 91.5629, + "step": 3922, + "task_loss": 1.5074098110198975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8228219862415216, + "compression/movement_sparsity/importance_threshold": -0.0012409078242262362, + "compression/movement_sparsity/linear_layer_sparsity": 0.8093744729517905, + "compression/movement_sparsity/model_sparsity": 0.7815699698787804, + "compression_loss": 87.81896209716797, + "distillation_loss": 5.6035661697387695, + "epoch": 3.32, + "learning_rate": 3.7132525594064056e-05, + "loss": 93.1085, + "step": 3923, + "task_loss": 2.62610125541687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8230882858168044, + "compression/movement_sparsity/importance_threshold": -0.0012390427326183862, + "compression/movement_sparsity/linear_layer_sparsity": 0.8095981941849746, + "compression/movement_sparsity/model_sparsity": 0.7817860055993575, + "compression_loss": 87.84691619873047, + "distillation_loss": 2.762773275375366, + "epoch": 3.32, + "learning_rate": 3.712782943552174e-05, + "loss": 92.1615, + "step": 3924, + "task_loss": 2.843660831451416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8233543184251164, + "compression/movement_sparsity/importance_threshold": -0.0012371795107764044, + "compression/movement_sparsity/linear_layer_sparsity": 0.8098053765976477, + "compression/movement_sparsity/model_sparsity": 0.7819860706587877, + "compression_loss": 87.87483215332031, + "distillation_loss": 4.545276641845703, + "epoch": 3.32, + "learning_rate": 3.7123133276979436e-05, + "loss": 92.016, + "step": 3925, + "task_loss": 2.5866806507110596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8236200842003436, + "compression/movement_sparsity/importance_threshold": -0.0012353181577625903, + "compression/movement_sparsity/linear_layer_sparsity": 0.8099839052354918, + "compression/movement_sparsity/model_sparsity": 0.7821584662887036, + "compression_loss": 87.90270233154297, + "distillation_loss": 4.158599376678467, + "epoch": 3.32, + "learning_rate": 3.711843711843712e-05, + "loss": 92.2186, + "step": 3926, + "task_loss": 2.08990740776062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8238855832763703, + "compression/movement_sparsity/importance_threshold": -0.001233458672639253, + "compression/movement_sparsity/linear_layer_sparsity": 0.8101314191133151, + "compression/movement_sparsity/model_sparsity": 0.7823009126110179, + "compression_loss": 87.93062591552734, + "distillation_loss": 2.797508955001831, + "epoch": 3.32, + "learning_rate": 3.711374095989481e-05, + "loss": 91.4538, + "step": 3927, + "task_loss": 1.3206008672714233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8241508157870824, + "compression/movement_sparsity/importance_threshold": -0.0012316010544686914, + "compression/movement_sparsity/linear_layer_sparsity": 0.8103962310281724, + "compression/movement_sparsity/model_sparsity": 0.782556627421943, + "compression_loss": 87.95844268798828, + "distillation_loss": 4.378626823425293, + "epoch": 3.32, + "learning_rate": 3.7109044801352495e-05, + "loss": 91.7812, + "step": 3928, + "task_loss": 3.042166233062744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8244157818663643, + "compression/movement_sparsity/importance_threshold": -0.0012297453023132135, + "compression/movement_sparsity/linear_layer_sparsity": 0.8106984848294062, + "compression/movement_sparsity/model_sparsity": 0.782848497875263, + "compression_loss": 87.98628997802734, + "distillation_loss": 5.075044631958008, + "epoch": 3.32, + "learning_rate": 3.710434864281018e-05, + "loss": 92.4558, + "step": 3929, + "task_loss": 2.5954647064208984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8246804816481018, + "compression/movement_sparsity/importance_threshold": -0.0012278914152351206, + "compression/movement_sparsity/linear_layer_sparsity": 0.8108832498069238, + "compression/movement_sparsity/model_sparsity": 0.7830269156073995, + "compression_loss": 88.01397705078125, + "distillation_loss": 2.5974600315093994, + "epoch": 3.32, + "learning_rate": 3.7099652484267874e-05, + "loss": 91.7768, + "step": 3930, + "task_loss": 1.1997915506362915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8249449152661795, + "compression/movement_sparsity/importance_threshold": -0.0012260393922967194, + "compression/movement_sparsity/linear_layer_sparsity": 0.8111075076276516, + "compression/movement_sparsity/model_sparsity": 0.7832434694820873, + "compression_loss": 88.04173278808594, + "distillation_loss": 3.8856089115142822, + "epoch": 3.32, + "learning_rate": 3.709495632572556e-05, + "loss": 92.229, + "step": 3931, + "task_loss": 2.5388429164886475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8252090828544829, + "compression/movement_sparsity/importance_threshold": -0.0012241892325603118, + "compression/movement_sparsity/linear_layer_sparsity": 0.8112844503512001, + "compression/movement_sparsity/model_sparsity": 0.7834143336787426, + "compression_loss": 88.06937408447266, + "distillation_loss": 5.522714614868164, + "epoch": 3.32, + "learning_rate": 3.709026016718325e-05, + "loss": 93.1526, + "step": 3932, + "task_loss": 1.9521583318710327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8254729845468971, + "compression/movement_sparsity/importance_threshold": -0.001222340935088201, + "compression/movement_sparsity/linear_layer_sparsity": 0.8114398580279983, + "compression/movement_sparsity/model_sparsity": 0.7835644026237528, + "compression_loss": 88.0970458984375, + "distillation_loss": 3.8998093605041504, + "epoch": 3.32, + "learning_rate": 3.708556400864093e-05, + "loss": 92.1745, + "step": 3933, + "task_loss": 1.6439554691314697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8257366204773071, + "compression/movement_sparsity/importance_threshold": -0.0012204944989426935, + "compression/movement_sparsity/linear_layer_sparsity": 0.8115950629939467, + "compression/movement_sparsity/model_sparsity": 0.7837142758216545, + "compression_loss": 88.1246566772461, + "distillation_loss": 4.4539642333984375, + "epoch": 3.33, + "learning_rate": 3.708086785009862e-05, + "loss": 92.6833, + "step": 3934, + "task_loss": 2.3233323097229004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8259999907795982, + "compression/movement_sparsity/importance_threshold": -0.0012186499231860914, + "compression/movement_sparsity/linear_layer_sparsity": 0.8117892242155614, + "compression/movement_sparsity/model_sparsity": 0.7839017670079971, + "compression_loss": 88.15223693847656, + "distillation_loss": 4.067781925201416, + "epoch": 3.33, + "learning_rate": 3.707617169155631e-05, + "loss": 92.1285, + "step": 3935, + "task_loss": 2.8161635398864746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8262630955876553, + "compression/movement_sparsity/importance_threshold": -0.0012168072068806995, + "compression/movement_sparsity/linear_layer_sparsity": 0.812018859835893, + "compression/movement_sparsity/model_sparsity": 0.7841235139383284, + "compression_loss": 88.17977905273438, + "distillation_loss": 2.5938148498535156, + "epoch": 3.33, + "learning_rate": 3.707147553301399e-05, + "loss": 91.8138, + "step": 3936, + "task_loss": 1.9563353061676025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8265259350353639, + "compression/movement_sparsity/importance_threshold": -0.0012149663490888208, + "compression/movement_sparsity/linear_layer_sparsity": 0.8122572954919396, + "compression/movement_sparsity/model_sparsity": 0.7843537585960759, + "compression_loss": 88.20732116699219, + "distillation_loss": 2.8357396125793457, + "epoch": 3.33, + "learning_rate": 3.7066779374471685e-05, + "loss": 92.0158, + "step": 3937, + "task_loss": 2.137120008468628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8267885092566086, + "compression/movement_sparsity/importance_threshold": -0.0012131273488727619, + "compression/movement_sparsity/linear_layer_sparsity": 0.8124652052788386, + "compression/movement_sparsity/model_sparsity": 0.7845545260421896, + "compression_loss": 88.23487091064453, + "distillation_loss": 3.7070930004119873, + "epoch": 3.33, + "learning_rate": 3.706208321592937e-05, + "loss": 91.6346, + "step": 3938, + "task_loss": 1.8734791278839111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8270508183852753, + "compression/movement_sparsity/importance_threshold": -0.001211290205294823, + "compression/movement_sparsity/linear_layer_sparsity": 0.8126002345531471, + "compression/movement_sparsity/model_sparsity": 0.784684916645527, + "compression_loss": 88.2623291015625, + "distillation_loss": 3.246004581451416, + "epoch": 3.33, + "learning_rate": 3.705738705738706e-05, + "loss": 91.6848, + "step": 3939, + "task_loss": 1.7143982648849487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8273128625552484, + "compression/movement_sparsity/importance_threshold": -0.0012094549174173118, + "compression/movement_sparsity/linear_layer_sparsity": 0.8129158672704684, + "compression/movement_sparsity/model_sparsity": 0.7849897064080085, + "compression_loss": 88.28975677490234, + "distillation_loss": 3.7962069511413574, + "epoch": 3.33, + "learning_rate": 3.7052690898844744e-05, + "loss": 92.3236, + "step": 3940, + "task_loss": 2.2654871940612793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8275746419004133, + "compression/movement_sparsity/importance_threshold": -0.0012076214843025302, + "compression/movement_sparsity/linear_layer_sparsity": 0.8130339046058958, + "compression/movement_sparsity/model_sparsity": 0.7851036887978385, + "compression_loss": 88.31714630126953, + "distillation_loss": 4.336456298828125, + "epoch": 3.33, + "learning_rate": 3.704799474030243e-05, + "loss": 92.2694, + "step": 3941, + "task_loss": 2.346320152282715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8278361565546555, + "compression/movement_sparsity/importance_threshold": -0.0012057899050127814, + "compression/movement_sparsity/linear_layer_sparsity": 0.8132509125327009, + "compression/movement_sparsity/model_sparsity": 0.7853132418347635, + "compression_loss": 88.34456634521484, + "distillation_loss": 4.086164474487305, + "epoch": 3.33, + "learning_rate": 3.7043298581760124e-05, + "loss": 91.9828, + "step": 3942, + "task_loss": 2.5183610916137695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8280974066518596, + "compression/movement_sparsity/importance_threshold": -0.0012039601786103709, + "compression/movement_sparsity/linear_layer_sparsity": 0.813489300492077, + "compression/movement_sparsity/model_sparsity": 0.7855434404343679, + "compression_loss": 88.37195587158203, + "distillation_loss": 4.417043685913086, + "epoch": 3.33, + "learning_rate": 3.703860242321781e-05, + "loss": 92.134, + "step": 3943, + "task_loss": 2.0718770027160645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8283583923259111, + "compression/movement_sparsity/importance_threshold": -0.0012021323041576017, + "compression/movement_sparsity/linear_layer_sparsity": 0.81380179715331, + "compression/movement_sparsity/model_sparsity": 0.7858452018739355, + "compression_loss": 88.3992919921875, + "distillation_loss": 3.059758424758911, + "epoch": 3.33, + "learning_rate": 3.7033906264675496e-05, + "loss": 91.2914, + "step": 3944, + "task_loss": 2.761995315551758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.828619113710695, + "compression/movement_sparsity/importance_threshold": -0.0012003062807167796, + "compression/movement_sparsity/linear_layer_sparsity": 0.8141207924859046, + "compression/movement_sparsity/model_sparsity": 0.786153238735511, + "compression_loss": 88.42668914794922, + "distillation_loss": 4.577442646026611, + "epoch": 3.33, + "learning_rate": 3.702921010613318e-05, + "loss": 93.0885, + "step": 3945, + "task_loss": 2.4807395935058594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8288795709400965, + "compression/movement_sparsity/importance_threshold": -0.0011984821073502058, + "compression/movement_sparsity/linear_layer_sparsity": 0.814422867424624, + "compression/movement_sparsity/model_sparsity": 0.786444936470794, + "compression_loss": 88.4539566040039, + "distillation_loss": 3.5703845024108887, + "epoch": 3.34, + "learning_rate": 3.702451394759087e-05, + "loss": 92.1028, + "step": 3946, + "task_loss": 1.8918733596801758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8291397641480005, + "compression/movement_sparsity/importance_threshold": -0.0011966597831201868, + "compression/movement_sparsity/linear_layer_sparsity": 0.8147567918150987, + "compression/movement_sparsity/model_sparsity": 0.7867673895311844, + "compression_loss": 88.48118591308594, + "distillation_loss": 3.9878056049346924, + "epoch": 3.34, + "learning_rate": 3.701981778904856e-05, + "loss": 92.5515, + "step": 3947, + "task_loss": 1.6891635656356812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8293996934682926, + "compression/movement_sparsity/importance_threshold": -0.0011948393070890248, + "compression/movement_sparsity/linear_layer_sparsity": 0.8148781679174641, + "compression/movement_sparsity/model_sparsity": 0.786884595991037, + "compression_loss": 88.50838470458984, + "distillation_loss": 2.475827693939209, + "epoch": 3.34, + "learning_rate": 3.701512163050625e-05, + "loss": 92.3976, + "step": 3948, + "task_loss": 1.2996852397918701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8296593590348574, + "compression/movement_sparsity/importance_threshold": -0.0011930206783190254, + "compression/movement_sparsity/linear_layer_sparsity": 0.8150161424611787, + "compression/movement_sparsity/model_sparsity": 0.7870178306847156, + "compression_loss": 88.5355453491211, + "distillation_loss": 4.750632286071777, + "epoch": 3.34, + "learning_rate": 3.7010425471963935e-05, + "loss": 92.9652, + "step": 3949, + "task_loss": 2.2748804092407227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8299187609815805, + "compression/movement_sparsity/importance_threshold": -0.0011912038958724917, + "compression/movement_sparsity/linear_layer_sparsity": 0.8151420258989105, + "compression/movement_sparsity/model_sparsity": 0.7871393896390987, + "compression_loss": 88.56269836425781, + "distillation_loss": 4.765791893005371, + "epoch": 3.34, + "learning_rate": 3.700572931342162e-05, + "loss": 92.5154, + "step": 3950, + "task_loss": 1.50233793258667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8301778994423469, + "compression/movement_sparsity/importance_threshold": -0.0011893889588117275, + "compression/movement_sparsity/linear_layer_sparsity": 0.8153292591791289, + "compression/movement_sparsity/model_sparsity": 0.7873201908801445, + "compression_loss": 88.58982849121094, + "distillation_loss": 3.5545244216918945, + "epoch": 3.34, + "learning_rate": 3.7001033154879314e-05, + "loss": 91.8625, + "step": 3951, + "task_loss": 1.6280970573425293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8304367745510416, + "compression/movement_sparsity/importance_threshold": -0.0011875758661990367, + "compression/movement_sparsity/linear_layer_sparsity": 0.8155258648551089, + "compression/movement_sparsity/model_sparsity": 0.787510042546325, + "compression_loss": 88.6168441772461, + "distillation_loss": 5.532506465911865, + "epoch": 3.34, + "learning_rate": 3.6996336996337e-05, + "loss": 92.99, + "step": 3952, + "task_loss": 2.5115251541137695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8306953864415499, + "compression/movement_sparsity/importance_threshold": -0.0011857646170967233, + "compression/movement_sparsity/linear_layer_sparsity": 0.8157396055599818, + "compression/movement_sparsity/model_sparsity": 0.7877164406004421, + "compression_loss": 88.64398956298828, + "distillation_loss": 4.099217414855957, + "epoch": 3.34, + "learning_rate": 3.699164083779468e-05, + "loss": 92.4702, + "step": 3953, + "task_loss": 1.847812533378601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8309537352477568, + "compression/movement_sparsity/importance_threshold": -0.001183955210567092, + "compression/movement_sparsity/linear_layer_sparsity": 0.8158791183213214, + "compression/movement_sparsity/model_sparsity": 0.7878511606692382, + "compression_loss": 88.67095947265625, + "distillation_loss": 3.4225873947143555, + "epoch": 3.34, + "learning_rate": 3.698694467925237e-05, + "loss": 92.3054, + "step": 3954, + "task_loss": 0.9704834818840027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8312118211035477, + "compression/movement_sparsity/importance_threshold": -0.001182147645672445, + "compression/movement_sparsity/linear_layer_sparsity": 0.816088578250013, + "compression/movement_sparsity/model_sparsity": 0.7880534250050051, + "compression_loss": 88.69805908203125, + "distillation_loss": 3.9661622047424316, + "epoch": 3.34, + "learning_rate": 3.698224852071006e-05, + "loss": 92.423, + "step": 3955, + "task_loss": 1.703989863395691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8314696441428073, + "compression/movement_sparsity/importance_threshold": -0.0011803419214750904, + "compression/movement_sparsity/linear_layer_sparsity": 0.8162609301690218, + "compression/movement_sparsity/model_sparsity": 0.7882198561053794, + "compression_loss": 88.72504425048828, + "distillation_loss": 3.625500202178955, + "epoch": 3.34, + "learning_rate": 3.697755236216775e-05, + "loss": 92.7499, + "step": 3956, + "task_loss": 2.0203709602355957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8317272044994214, + "compression/movement_sparsity/importance_threshold": -0.0011785380370373261, + "compression/movement_sparsity/linear_layer_sparsity": 0.8164100418853082, + "compression/movement_sparsity/model_sparsity": 0.7883638453754901, + "compression_loss": 88.75201416015625, + "distillation_loss": 3.6596343517303467, + "epoch": 3.34, + "learning_rate": 3.697285620362544e-05, + "loss": 92.4173, + "step": 3957, + "task_loss": 1.8166301250457764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8319845023072744, + "compression/movement_sparsity/importance_threshold": -0.0011767359914214614, + "compression/movement_sparsity/linear_layer_sparsity": 0.8166303170340455, + "compression/movement_sparsity/model_sparsity": 0.7885765533952227, + "compression_loss": 88.77897644042969, + "distillation_loss": 3.64263916015625, + "epoch": 3.35, + "learning_rate": 3.6968160045083125e-05, + "loss": 93.0686, + "step": 3958, + "task_loss": 1.6418927907943726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.832241537700252, + "compression/movement_sparsity/importance_threshold": -0.0011749357836897964, + "compression/movement_sparsity/linear_layer_sparsity": 0.8167915914013205, + "compression/movement_sparsity/model_sparsity": 0.7887322874918439, + "compression_loss": 88.80581665039062, + "distillation_loss": 5.406153678894043, + "epoch": 3.35, + "learning_rate": 3.696346388654081e-05, + "loss": 93.4399, + "step": 3959, + "task_loss": 2.6660962104797363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8324983108122389, + "compression/movement_sparsity/importance_threshold": -0.0011731374129046396, + "compression/movement_sparsity/linear_layer_sparsity": 0.8171294388429474, + "compression/movement_sparsity/model_sparsity": 0.7890585288345106, + "compression_loss": 88.83274841308594, + "distillation_loss": 4.4663472175598145, + "epoch": 3.35, + "learning_rate": 3.69587677279985e-05, + "loss": 92.892, + "step": 3960, + "task_loss": 2.2665445804595947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8327548217771207, + "compression/movement_sparsity/importance_threshold": -0.0011713408781282896, + "compression/movement_sparsity/linear_layer_sparsity": 0.8173594918091461, + "compression/movement_sparsity/model_sparsity": 0.7892806787735946, + "compression_loss": 88.8595962524414, + "distillation_loss": 4.842007160186768, + "epoch": 3.35, + "learning_rate": 3.695407156945619e-05, + "loss": 92.7, + "step": 3961, + "task_loss": 2.6616220474243164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8330110707287821, + "compression/movement_sparsity/importance_threshold": -0.0011695461784230555, + "compression/movement_sparsity/linear_layer_sparsity": 0.8175645040233096, + "compression/movement_sparsity/model_sparsity": 0.7894786481875101, + "compression_loss": 88.88639068603516, + "distillation_loss": 4.151176452636719, + "epoch": 3.35, + "learning_rate": 3.694937541091387e-05, + "loss": 92.9752, + "step": 3962, + "task_loss": 3.4933841228485107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8332670578011085, + "compression/movement_sparsity/importance_threshold": -0.0011677533128512376, + "compression/movement_sparsity/linear_layer_sparsity": 0.8177687173182413, + "compression/movement_sparsity/model_sparsity": 0.7896758461275275, + "compression_loss": 88.91317749023438, + "distillation_loss": 3.627025604248047, + "epoch": 3.35, + "learning_rate": 3.6944679252371564e-05, + "loss": 92.526, + "step": 3963, + "task_loss": 2.206144094467163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8335227831279851, + "compression/movement_sparsity/importance_threshold": -0.001165962280475141, + "compression/movement_sparsity/linear_layer_sparsity": 0.8180042554015525, + "compression/movement_sparsity/model_sparsity": 0.789903292753077, + "compression_loss": 88.93992614746094, + "distillation_loss": 4.077983379364014, + "epoch": 3.35, + "learning_rate": 3.693998309382925e-05, + "loss": 93.9092, + "step": 3964, + "task_loss": 3.2774624824523926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8337782468432968, + "compression/movement_sparsity/importance_threshold": -0.00116417308035707, + "compression/movement_sparsity/linear_layer_sparsity": 0.8182186757839807, + "compression/movement_sparsity/model_sparsity": 0.7901103471357345, + "compression_loss": 88.96666717529297, + "distillation_loss": 5.144759178161621, + "epoch": 3.35, + "learning_rate": 3.6935286935286937e-05, + "loss": 93.5206, + "step": 3965, + "task_loss": 3.137341260910034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.834033449080929, + "compression/movement_sparsity/importance_threshold": -0.0011623857115593272, + "compression/movement_sparsity/linear_layer_sparsity": 0.8184172847201235, + "compression/movement_sparsity/model_sparsity": 0.7903021332439283, + "compression_loss": 88.99337768554688, + "distillation_loss": 3.665590763092041, + "epoch": 3.35, + "learning_rate": 3.693059077674462e-05, + "loss": 92.6423, + "step": 3966, + "task_loss": 2.5177559852600098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8342883899747666, + "compression/movement_sparsity/importance_threshold": -0.0011606001731442198, + "compression/movement_sparsity/linear_layer_sparsity": 0.8185267366548532, + "compression/movement_sparsity/model_sparsity": 0.7904078251679865, + "compression_loss": 89.02003479003906, + "distillation_loss": 3.1894383430480957, + "epoch": 3.35, + "learning_rate": 3.692589461820231e-05, + "loss": 92.2815, + "step": 3967, + "task_loss": 1.9407908916473389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8345430696586951, + "compression/movement_sparsity/importance_threshold": -0.0011588164641740473, + "compression/movement_sparsity/linear_layer_sparsity": 0.8187003525356313, + "compression/movement_sparsity/model_sparsity": 0.790575476809155, + "compression_loss": 89.04669189453125, + "distillation_loss": 6.0429487228393555, + "epoch": 3.35, + "learning_rate": 3.692119845966e-05, + "loss": 93.5953, + "step": 3968, + "task_loss": 3.563713788986206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.834797488266599, + "compression/movement_sparsity/importance_threshold": -0.0011570345837111189, + "compression/movement_sparsity/linear_layer_sparsity": 0.8189081192325186, + "compression/movement_sparsity/model_sparsity": 0.790776106080839, + "compression_loss": 89.07333374023438, + "distillation_loss": 5.140173435211182, + "epoch": 3.35, + "learning_rate": 3.691650230111769e-05, + "loss": 93.3749, + "step": 3969, + "task_loss": 3.711853265762329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8350516459323641, + "compression/movement_sparsity/importance_threshold": -0.0011552545308177332, + "compression/movement_sparsity/linear_layer_sparsity": 0.8190605697157431, + "compression/movement_sparsity/model_sparsity": 0.7909233194209723, + "compression_loss": 89.09992218017578, + "distillation_loss": 4.1165266036987305, + "epoch": 3.36, + "learning_rate": 3.6911806142575375e-05, + "loss": 93.4851, + "step": 3970, + "task_loss": 1.076065182685852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8353055427898752, + "compression/movement_sparsity/importance_threshold": -0.0011534763045561985, + "compression/movement_sparsity/linear_layer_sparsity": 0.819471083034943, + "compression/movement_sparsity/model_sparsity": 0.7913197303447709, + "compression_loss": 89.12654113769531, + "distillation_loss": 4.678439617156982, + "epoch": 3.36, + "learning_rate": 3.690710998403306e-05, + "loss": 93.1489, + "step": 3971, + "task_loss": 2.504371166229248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8355591789730175, + "compression/movement_sparsity/importance_threshold": -0.001151699903988817, + "compression/movement_sparsity/linear_layer_sparsity": 0.8196287086069214, + "compression/movement_sparsity/model_sparsity": 0.7914719409934389, + "compression_loss": 89.15310668945312, + "distillation_loss": 4.914548873901367, + "epoch": 3.36, + "learning_rate": 3.690241382549075e-05, + "loss": 93.4603, + "step": 3972, + "task_loss": 2.8925068378448486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8358125546156762, + "compression/movement_sparsity/importance_threshold": -0.0011499253281778915, + "compression/movement_sparsity/linear_layer_sparsity": 0.8198363679863, + "compression/movement_sparsity/model_sparsity": 0.7916724666343008, + "compression_loss": 89.17958068847656, + "distillation_loss": 6.253751754760742, + "epoch": 3.36, + "learning_rate": 3.689771766694844e-05, + "loss": 93.8565, + "step": 3973, + "task_loss": 3.179927110671997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8360656698517362, + "compression/movement_sparsity/importance_threshold": -0.0011481525761857288, + "compression/movement_sparsity/linear_layer_sparsity": 0.8200846172323111, + "compression/movement_sparsity/model_sparsity": 0.7919121877550074, + "compression_loss": 89.20613098144531, + "distillation_loss": 5.055649757385254, + "epoch": 3.36, + "learning_rate": 3.689302150840613e-05, + "loss": 93.4686, + "step": 3974, + "task_loss": 2.6819663047790527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8363185248150832, + "compression/movement_sparsity/importance_threshold": -0.0011463816470746293, + "compression/movement_sparsity/linear_layer_sparsity": 0.8201867596522798, + "compression/movement_sparsity/model_sparsity": 0.7920108212686234, + "compression_loss": 89.2325668334961, + "distillation_loss": 3.6938886642456055, + "epoch": 3.36, + "learning_rate": 3.6888325349863814e-05, + "loss": 93.5699, + "step": 3975, + "task_loss": 2.969996929168701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8365711196396016, + "compression/movement_sparsity/importance_threshold": -0.001144612539906901, + "compression/movement_sparsity/linear_layer_sparsity": 0.8203856786167812, + "compression/movement_sparsity/model_sparsity": 0.7922029067547479, + "compression_loss": 89.25902557373047, + "distillation_loss": 4.012026786804199, + "epoch": 3.36, + "learning_rate": 3.68836291913215e-05, + "loss": 92.7972, + "step": 3976, + "task_loss": 2.1305065155029297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8368234544591773, + "compression/movement_sparsity/importance_threshold": -0.0011428452537448437, + "compression/movement_sparsity/linear_layer_sparsity": 0.8207295239147291, + "compression/movement_sparsity/model_sparsity": 0.7925349399089193, + "compression_loss": 89.28539276123047, + "distillation_loss": 3.202460289001465, + "epoch": 3.36, + "learning_rate": 3.6878933032779186e-05, + "loss": 93.0733, + "step": 3977, + "task_loss": 1.9267168045043945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8370755294076948, + "compression/movement_sparsity/importance_threshold": -0.0011410797876507655, + "compression/movement_sparsity/linear_layer_sparsity": 0.8209687107933368, + "compression/movement_sparsity/model_sparsity": 0.792765909982422, + "compression_loss": 89.31175231933594, + "distillation_loss": 3.961599826812744, + "epoch": 3.36, + "learning_rate": 3.687423687423688e-05, + "loss": 93.6401, + "step": 3978, + "task_loss": 2.6213057041168213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8373273446190396, + "compression/movement_sparsity/importance_threshold": -0.0011393161406869669, + "compression/movement_sparsity/linear_layer_sparsity": 0.8211236534275972, + "compression/movement_sparsity/model_sparsity": 0.7929155298605363, + "compression_loss": 89.33808898925781, + "distillation_loss": 5.167476654052734, + "epoch": 3.36, + "learning_rate": 3.686954071569456e-05, + "loss": 93.9582, + "step": 3979, + "task_loss": 2.4182441234588623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8375789002270966, + "compression/movement_sparsity/importance_threshold": -0.0011375543119157552, + "compression/movement_sparsity/linear_layer_sparsity": 0.8212501449978784, + "compression/movement_sparsity/model_sparsity": 0.7930376760562448, + "compression_loss": 89.36444091796875, + "distillation_loss": 3.967257022857666, + "epoch": 3.36, + "learning_rate": 3.686484455715225e-05, + "loss": 93.3218, + "step": 3980, + "task_loss": 1.9096957445144653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8378301963657511, + "compression/movement_sparsity/importance_threshold": -0.0011357943003994318, + "compression/movement_sparsity/linear_layer_sparsity": 0.8214273739014503, + "compression/movement_sparsity/model_sparsity": 0.793208816601759, + "compression_loss": 89.39071655273438, + "distillation_loss": 5.810724258422852, + "epoch": 3.36, + "learning_rate": 3.686014839860994e-05, + "loss": 94.4424, + "step": 3981, + "task_loss": 2.1504719257354736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8380812331688884, + "compression/movement_sparsity/importance_threshold": -0.0011340361052003005, + "compression/movement_sparsity/linear_layer_sparsity": 0.8216943440906497, + "compression/movement_sparsity/model_sparsity": 0.793466615543663, + "compression_loss": 89.41697692871094, + "distillation_loss": 6.392257213592529, + "epoch": 3.37, + "learning_rate": 3.685545224006763e-05, + "loss": 94.1482, + "step": 3982, + "task_loss": 3.4417433738708496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8383320107703933, + "compression/movement_sparsity/importance_threshold": -0.001132279725380666, + "compression/movement_sparsity/linear_layer_sparsity": 0.8218652889578774, + "compression/movement_sparsity/model_sparsity": 0.7936316879288136, + "compression_loss": 89.4432601928711, + "distillation_loss": 5.075407028198242, + "epoch": 3.37, + "learning_rate": 3.685075608152531e-05, + "loss": 94.5565, + "step": 3983, + "task_loss": 1.8612474203109741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8385825293041512, + "compression/movement_sparsity/importance_threshold": -0.0011305251600028333, + "compression/movement_sparsity/linear_layer_sparsity": 0.8219223818725179, + "compression/movement_sparsity/model_sparsity": 0.7936868195261978, + "compression_loss": 89.46949005126953, + "distillation_loss": 4.514585018157959, + "epoch": 3.37, + "learning_rate": 3.6846059922983e-05, + "loss": 93.2395, + "step": 3984, + "task_loss": 3.5320873260498047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.838832788904047, + "compression/movement_sparsity/importance_threshold": -0.0011287724081291061, + "compression/movement_sparsity/linear_layer_sparsity": 0.8221175208758787, + "compression/movement_sparsity/model_sparsity": 0.7938752549044754, + "compression_loss": 89.49565887451172, + "distillation_loss": 3.862992763519287, + "epoch": 3.37, + "learning_rate": 3.684136376444069e-05, + "loss": 93.8631, + "step": 3985, + "task_loss": 2.990511178970337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8390827897039662, + "compression/movement_sparsity/importance_threshold": -0.0011270214688217867, + "compression/movement_sparsity/linear_layer_sparsity": 0.8222458606921436, + "compression/movement_sparsity/model_sparsity": 0.7939991858532321, + "compression_loss": 89.52183532714844, + "distillation_loss": 5.26555871963501, + "epoch": 3.37, + "learning_rate": 3.683666760589838e-05, + "loss": 93.784, + "step": 3986, + "task_loss": 3.657965660095215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8393325318377935, + "compression/movement_sparsity/importance_threshold": -0.0011252723411431806, + "compression/movement_sparsity/linear_layer_sparsity": 0.8223731034849858, + "compression/movement_sparsity/model_sparsity": 0.7941220574646957, + "compression_loss": 89.54790496826172, + "distillation_loss": 5.825346946716309, + "epoch": 3.37, + "learning_rate": 3.683197144735606e-05, + "loss": 95.0281, + "step": 3987, + "task_loss": 2.6280922889709473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8395820154394146, + "compression/movement_sparsity/importance_threshold": -0.001123525024155591, + "compression/movement_sparsity/linear_layer_sparsity": 0.8224808383395759, + "compression/movement_sparsity/model_sparsity": 0.7942260912955993, + "compression_loss": 89.57408905029297, + "distillation_loss": 4.264525413513184, + "epoch": 3.37, + "learning_rate": 3.682727528881375e-05, + "loss": 93.5911, + "step": 3988, + "task_loss": 1.6111263036727905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.839831240642714, + "compression/movement_sparsity/importance_threshold": -0.0011217795169213233, + "compression/movement_sparsity/linear_layer_sparsity": 0.8226116703068765, + "compression/movement_sparsity/model_sparsity": 0.794352428782337, + "compression_loss": 89.60014343261719, + "distillation_loss": 4.176291465759277, + "epoch": 3.37, + "learning_rate": 3.682257913027144e-05, + "loss": 93.6193, + "step": 3989, + "task_loss": 2.489731550216675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8400802075815771, + "compression/movement_sparsity/importance_threshold": -0.0011200358185026807, + "compression/movement_sparsity/linear_layer_sparsity": 0.8228172906535894, + "compression/movement_sparsity/model_sparsity": 0.7945509854375781, + "compression_loss": 89.626220703125, + "distillation_loss": 5.954404830932617, + "epoch": 3.37, + "learning_rate": 3.681788297172913e-05, + "loss": 94.4639, + "step": 3990, + "task_loss": 2.7733876705169678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8403289163898893, + "compression/movement_sparsity/importance_threshold": -0.0011182939279619653, + "compression/movement_sparsity/linear_layer_sparsity": 0.8230489533824189, + "compression/movement_sparsity/model_sparsity": 0.7947746898389945, + "compression_loss": 89.65226745605469, + "distillation_loss": 5.770582675933838, + "epoch": 3.37, + "learning_rate": 3.6813186813186815e-05, + "loss": 94.5703, + "step": 3991, + "task_loss": 2.4439711570739746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8405773672015353, + "compression/movement_sparsity/importance_threshold": -0.0011165538443614845, + "compression/movement_sparsity/linear_layer_sparsity": 0.8231905528730948, + "compression/movement_sparsity/model_sparsity": 0.7949114249515546, + "compression_loss": 89.67829895019531, + "distillation_loss": 4.157776355743408, + "epoch": 3.37, + "learning_rate": 3.68084906546445e-05, + "loss": 93.804, + "step": 3992, + "task_loss": 2.197361946105957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8408255601504008, + "compression/movement_sparsity/importance_threshold": -0.0011148155667635388, + "compression/movement_sparsity/linear_layer_sparsity": 0.8234133082487004, + "compression/movement_sparsity/model_sparsity": 0.7951265279947324, + "compression_loss": 89.70433044433594, + "distillation_loss": 6.018702507019043, + "epoch": 3.38, + "learning_rate": 3.680379449610219e-05, + "loss": 94.104, + "step": 3993, + "task_loss": 3.0476107597351074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8410734953703702, + "compression/movement_sparsity/importance_threshold": -0.0011130790942304354, + "compression/movement_sparsity/linear_layer_sparsity": 0.8236222673623512, + "compression/movement_sparsity/model_sparsity": 0.7953283087199959, + "compression_loss": 89.73027801513672, + "distillation_loss": 4.065033912658691, + "epoch": 3.38, + "learning_rate": 3.679909833755988e-05, + "loss": 93.8287, + "step": 3994, + "task_loss": 1.5901000499725342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8413211729953295, + "compression/movement_sparsity/importance_threshold": -0.0011113444258244749, + "compression/movement_sparsity/linear_layer_sparsity": 0.8238350660579808, + "compression/movement_sparsity/model_sparsity": 0.7955337971257853, + "compression_loss": 89.7562484741211, + "distillation_loss": 4.049321174621582, + "epoch": 3.38, + "learning_rate": 3.679440217901757e-05, + "loss": 93.8094, + "step": 3995, + "task_loss": 1.7693833112716675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.841568593159163, + "compression/movement_sparsity/importance_threshold": -0.0011096115606079654, + "compression/movement_sparsity/linear_layer_sparsity": 0.8239321884033749, + "compression/movement_sparsity/model_sparsity": 0.7956275830198318, + "compression_loss": 89.78217315673828, + "distillation_loss": 3.4872629642486572, + "epoch": 3.38, + "learning_rate": 3.6789706020475254e-05, + "loss": 93.201, + "step": 3996, + "task_loss": 1.6915392875671387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8418157559957565, + "compression/movement_sparsity/importance_threshold": -0.0011078804976432056, + "compression/movement_sparsity/linear_layer_sparsity": 0.8240835776356799, + "compression/movement_sparsity/model_sparsity": 0.7957737715662794, + "compression_loss": 89.80810546875, + "distillation_loss": 5.2877197265625, + "epoch": 3.38, + "learning_rate": 3.678500986193294e-05, + "loss": 94.4685, + "step": 3997, + "task_loss": 2.872746706008911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8420626616389947, + "compression/movement_sparsity/importance_threshold": -0.0011061512359925056, + "compression/movement_sparsity/linear_layer_sparsity": 0.8242061223064736, + "compression/movement_sparsity/model_sparsity": 0.7958921064506399, + "compression_loss": 89.83394622802734, + "distillation_loss": 3.74898099899292, + "epoch": 3.38, + "learning_rate": 3.6780313703390626e-05, + "loss": 93.6162, + "step": 3998, + "task_loss": 2.2747697830200195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.842309310222763, + "compression/movement_sparsity/importance_threshold": -0.0011044237747181648, + "compression/movement_sparsity/linear_layer_sparsity": 0.8243634020775906, + "compression/movement_sparsity/model_sparsity": 0.7960439831777698, + "compression_loss": 89.85975646972656, + "distillation_loss": 6.204662322998047, + "epoch": 3.38, + "learning_rate": 3.677561754484832e-05, + "loss": 93.4307, + "step": 3999, + "task_loss": 2.6401376724243164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8425557018809464, + "compression/movement_sparsity/importance_threshold": -0.0011026981128824881, + "compression/movement_sparsity/linear_layer_sparsity": 0.8245461757191132, + "compression/movement_sparsity/model_sparsity": 0.7962204779824286, + "compression_loss": 89.88553619384766, + "distillation_loss": 4.542688369750977, + "epoch": 3.38, + "learning_rate": 3.6770921386306e-05, + "loss": 93.7523, + "step": 4000, + "task_loss": 2.1997807025909424 + }, + { + "epoch": 3.38, + "eval_accuracy": 0.5992475247524752, + "eval_loss": 93.37529754638672, + "eval_runtime": 311.0661, + "eval_samples_per_second": 81.172, + "eval_steps_per_second": 0.637, + "step": 4000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8428018367474303, + "compression/movement_sparsity/importance_threshold": -0.0011009742495477802, + "compression/movement_sparsity/linear_layer_sparsity": 0.8247485288438937, + "compression/movement_sparsity/model_sparsity": 0.7964158796548619, + "compression_loss": 89.91133117675781, + "distillation_loss": 4.378180503845215, + "epoch": 3.38, + "learning_rate": 3.676622522776369e-05, + "loss": 93.3666, + "step": 4001, + "task_loss": 1.712186336517334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8430477149560993, + "compression/movement_sparsity/importance_threshold": -0.001099252183776346, + "compression/movement_sparsity/linear_layer_sparsity": 0.8249674565616882, + "compression/movement_sparsity/model_sparsity": 0.7966272865320497, + "compression_loss": 89.93704986572266, + "distillation_loss": 5.9553751945495605, + "epoch": 3.38, + "learning_rate": 3.676152906922138e-05, + "loss": 94.2253, + "step": 4002, + "task_loss": 4.005069732666016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8432933366408389, + "compression/movement_sparsity/importance_threshold": -0.0010975319146304882, + "compression/movement_sparsity/linear_layer_sparsity": 0.8251431114751321, + "compression/movement_sparsity/model_sparsity": 0.7967969071588391, + "compression_loss": 89.9627456665039, + "distillation_loss": 3.6137094497680664, + "epoch": 3.38, + "learning_rate": 3.6756832910679065e-05, + "loss": 94.4025, + "step": 4003, + "task_loss": 2.1843485832214355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8435387019355345, + "compression/movement_sparsity/importance_threshold": -0.0010958134411725102, + "compression/movement_sparsity/linear_layer_sparsity": 0.8253232021789365, + "compression/movement_sparsity/model_sparsity": 0.796970811192944, + "compression_loss": 89.98843383789062, + "distillation_loss": 4.9405717849731445, + "epoch": 3.38, + "learning_rate": 3.675213675213676e-05, + "loss": 94.1285, + "step": 4004, + "task_loss": 2.347391366958618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8437838109740706, + "compression/movement_sparsity/importance_threshold": -0.0010940967624647182, + "compression/movement_sparsity/linear_layer_sparsity": 0.8254915118051167, + "compression/movement_sparsity/model_sparsity": 0.7971333388656839, + "compression_loss": 90.01409912109375, + "distillation_loss": 4.22064208984375, + "epoch": 3.39, + "learning_rate": 3.674744059359444e-05, + "loss": 93.6736, + "step": 4005, + "task_loss": 2.274905204772949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.844028663890333, + "compression/movement_sparsity/importance_threshold": -0.0010923818775694129, + "compression/movement_sparsity/linear_layer_sparsity": 0.8257831292488194, + "compression/movement_sparsity/model_sparsity": 0.7974149383530753, + "compression_loss": 90.03973388671875, + "distillation_loss": 4.685396671295166, + "epoch": 3.39, + "learning_rate": 3.674274443505213e-05, + "loss": 93.5858, + "step": 4006, + "task_loss": 1.545034408569336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8442732608182064, + "compression/movement_sparsity/importance_threshold": -0.0010906687855489006, + "compression/movement_sparsity/linear_layer_sparsity": 0.8260030466725277, + "compression/movement_sparsity/model_sparsity": 0.797627300936734, + "compression_loss": 90.06533813476562, + "distillation_loss": 3.726595163345337, + "epoch": 3.39, + "learning_rate": 3.673804827650982e-05, + "loss": 94.2731, + "step": 4007, + "task_loss": 2.017807960510254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8445176018915762, + "compression/movement_sparsity/importance_threshold": -0.0010889574854654844, + "compression/movement_sparsity/linear_layer_sparsity": 0.8260976649427182, + "compression/movement_sparsity/model_sparsity": 0.7977186687782637, + "compression_loss": 90.09091186523438, + "distillation_loss": 4.626942157745361, + "epoch": 3.39, + "learning_rate": 3.6733352117967503e-05, + "loss": 95.0854, + "step": 4008, + "task_loss": 2.6714439392089844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8447616872443272, + "compression/movement_sparsity/importance_threshold": -0.00108724797638147, + "compression/movement_sparsity/linear_layer_sparsity": 0.8263725050825572, + "compression/movement_sparsity/model_sparsity": 0.797984067313792, + "compression_loss": 90.11647033691406, + "distillation_loss": 5.864928245544434, + "epoch": 3.39, + "learning_rate": 3.672865595942519e-05, + "loss": 93.9198, + "step": 4009, + "task_loss": 3.1061229705810547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.845005517010345, + "compression/movement_sparsity/importance_threshold": -0.0010855402573591586, + "compression/movement_sparsity/linear_layer_sparsity": 0.826501011837169, + "compression/movement_sparsity/model_sparsity": 0.7981081594660497, + "compression_loss": 90.1419448852539, + "distillation_loss": 4.828771114349365, + "epoch": 3.39, + "learning_rate": 3.6723959800882876e-05, + "loss": 94.4144, + "step": 4010, + "task_loss": 1.9714725017547607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8452490913235142, + "compression/movement_sparsity/importance_threshold": -0.0010838343274608576, + "compression/movement_sparsity/linear_layer_sparsity": 0.8266318676528048, + "compression/movement_sparsity/model_sparsity": 0.7982345199818591, + "compression_loss": 90.16741943359375, + "distillation_loss": 4.570655822753906, + "epoch": 3.39, + "learning_rate": 3.671926364234057e-05, + "loss": 94.5963, + "step": 4011, + "task_loss": 3.6325502395629883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8454924103177204, + "compression/movement_sparsity/importance_threshold": -0.0010821301857488683, + "compression/movement_sparsity/linear_layer_sparsity": 0.8268041837993108, + "compression/movement_sparsity/model_sparsity": 0.798400916538626, + "compression_loss": 90.19284057617188, + "distillation_loss": 3.2912402153015137, + "epoch": 3.39, + "learning_rate": 3.6714567483798256e-05, + "loss": 94.4459, + "step": 4012, + "task_loss": 2.136094570159912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8457354741268487, + "compression/movement_sparsity/importance_threshold": -0.0010804278312854937, + "compression/movement_sparsity/linear_layer_sparsity": 0.8271148918353984, + "compression/movement_sparsity/model_sparsity": 0.7987009507978243, + "compression_loss": 90.21829986572266, + "distillation_loss": 5.338934898376465, + "epoch": 3.39, + "learning_rate": 3.670987132525594e-05, + "loss": 93.4635, + "step": 4013, + "task_loss": 2.87656831741333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8459782828847839, + "compression/movement_sparsity/importance_threshold": -0.0010787272631330412, + "compression/movement_sparsity/linear_layer_sparsity": 0.827284465423348, + "compression/movement_sparsity/model_sparsity": 0.7988646990113585, + "compression_loss": 90.24361419677734, + "distillation_loss": 3.7629055976867676, + "epoch": 3.39, + "learning_rate": 3.670517516671363e-05, + "loss": 94.7441, + "step": 4014, + "task_loss": 2.2653439044952393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8462208367254116, + "compression/movement_sparsity/importance_threshold": -0.0010770284803538113, + "compression/movement_sparsity/linear_layer_sparsity": 0.8274868543206315, + "compression/movement_sparsity/model_sparsity": 0.7990601352273993, + "compression_loss": 90.26899719238281, + "distillation_loss": 3.008218765258789, + "epoch": 3.39, + "learning_rate": 3.6700479008171315e-05, + "loss": 93.7245, + "step": 4015, + "task_loss": 2.6759941577911377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8464631357826164, + "compression/movement_sparsity/importance_threshold": -0.0010753314820101112, + "compression/movement_sparsity/linear_layer_sparsity": 0.8278032978813521, + "compression/movement_sparsity/model_sparsity": 0.7993657079783147, + "compression_loss": 90.29431915283203, + "distillation_loss": 5.495811462402344, + "epoch": 3.39, + "learning_rate": 3.669578284962901e-05, + "loss": 94.644, + "step": 4016, + "task_loss": 3.655524492263794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8467051801902841, + "compression/movement_sparsity/importance_threshold": -0.0010736362671642423, + "compression/movement_sparsity/linear_layer_sparsity": 0.8279653473195234, + "compression/movement_sparsity/model_sparsity": 0.7995221905197626, + "compression_loss": 90.31966400146484, + "distillation_loss": 4.061117172241211, + "epoch": 3.4, + "learning_rate": 3.669108669108669e-05, + "loss": 94.937, + "step": 4017, + "task_loss": 3.019972801208496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8469469700822992, + "compression/movement_sparsity/importance_threshold": -0.0010719428348785111, + "compression/movement_sparsity/linear_layer_sparsity": 0.8280224640824992, + "compression/movement_sparsity/model_sparsity": 0.7995773451462185, + "compression_loss": 90.34500885009766, + "distillation_loss": 4.476384162902832, + "epoch": 3.4, + "learning_rate": 3.668639053254438e-05, + "loss": 94.8489, + "step": 4018, + "task_loss": 2.9321508407592773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8471885055925473, + "compression/movement_sparsity/importance_threshold": -0.001070251184215219, + "compression/movement_sparsity/linear_layer_sparsity": 0.828236598284904, + "compression/movement_sparsity/model_sparsity": 0.7997841231800168, + "compression_loss": 90.37025451660156, + "distillation_loss": 4.159713268280029, + "epoch": 3.4, + "learning_rate": 3.668169437400207e-05, + "loss": 94.3754, + "step": 4019, + "task_loss": 2.629488945007324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8474297868549132, + "compression/movement_sparsity/importance_threshold": -0.0010685613142366714, + "compression/movement_sparsity/linear_layer_sparsity": 0.8284583878029311, + "compression/movement_sparsity/model_sparsity": 0.7999982935457952, + "compression_loss": 90.39547729492188, + "distillation_loss": 4.1260576248168945, + "epoch": 3.4, + "learning_rate": 3.667699821545976e-05, + "loss": 94.1859, + "step": 4020, + "task_loss": 1.9738210439682007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8476708140032823, + "compression/movement_sparsity/importance_threshold": -0.0010668732240051724, + "compression/movement_sparsity/linear_layer_sparsity": 0.8286815724485715, + "compression/movement_sparsity/model_sparsity": 0.8002138111122616, + "compression_loss": 90.42064666748047, + "distillation_loss": 3.8680572509765625, + "epoch": 3.4, + "learning_rate": 3.6672302056917446e-05, + "loss": 94.9111, + "step": 4021, + "task_loss": 2.539458990097046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8479115871715397, + "compression/movement_sparsity/importance_threshold": -0.0010651869125830249, + "compression/movement_sparsity/linear_layer_sparsity": 0.8288818269198482, + "compression/movement_sparsity/model_sparsity": 0.8004071862263952, + "compression_loss": 90.44585418701172, + "distillation_loss": 3.463296890258789, + "epoch": 3.4, + "learning_rate": 3.666760589837513e-05, + "loss": 94.6061, + "step": 4022, + "task_loss": 1.7755612134933472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8481521064935703, + "compression/movement_sparsity/importance_threshold": -0.0010635023790325345, + "compression/movement_sparsity/linear_layer_sparsity": 0.8291031394711699, + "compression/movement_sparsity/model_sparsity": 0.8006208960107418, + "compression_loss": 90.47101593017578, + "distillation_loss": 2.89973783493042, + "epoch": 3.4, + "learning_rate": 3.666290973983282e-05, + "loss": 93.7827, + "step": 4023, + "task_loss": 1.9414618015289307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8483923721032597, + "compression/movement_sparsity/importance_threshold": -0.0010618196224160027, + "compression/movement_sparsity/linear_layer_sparsity": 0.8293222102789759, + "compression/movement_sparsity/model_sparsity": 0.8008324410623592, + "compression_loss": 90.49610900878906, + "distillation_loss": 4.550585746765137, + "epoch": 3.4, + "learning_rate": 3.6658213581290505e-05, + "loss": 94.8838, + "step": 4024, + "task_loss": 3.320986747741699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8486323841344925, + "compression/movement_sparsity/importance_threshold": -0.0010601386417957367, + "compression/movement_sparsity/linear_layer_sparsity": 0.8294404622494208, + "compression/movement_sparsity/model_sparsity": 0.8009466307138337, + "compression_loss": 90.52119445800781, + "distillation_loss": 3.6590003967285156, + "epoch": 3.4, + "learning_rate": 3.66535174227482e-05, + "loss": 94.7263, + "step": 4025, + "task_loss": 1.0616800785064697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8488721427211543, + "compression/movement_sparsity/importance_threshold": -0.001058459436234038, + "compression/movement_sparsity/linear_layer_sparsity": 0.8296288044772292, + "compression/movement_sparsity/model_sparsity": 0.8011285028067084, + "compression_loss": 90.5462646484375, + "distillation_loss": 3.410371780395508, + "epoch": 3.4, + "learning_rate": 3.664882126420588e-05, + "loss": 94.5795, + "step": 4026, + "task_loss": 2.033132791519165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8491116479971299, + "compression/movement_sparsity/importance_threshold": -0.0010567820047932119, + "compression/movement_sparsity/linear_layer_sparsity": 0.8297351442042058, + "compression/movement_sparsity/model_sparsity": 0.8012311894369241, + "compression_loss": 90.5712890625, + "distillation_loss": 4.2272820472717285, + "epoch": 3.4, + "learning_rate": 3.664412510566357e-05, + "loss": 95.4442, + "step": 4027, + "task_loss": 3.7124855518341064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8493509000963049, + "compression/movement_sparsity/importance_threshold": -0.00105510634653556, + "compression/movement_sparsity/linear_layer_sparsity": 0.8298981237274529, + "compression/movement_sparsity/model_sparsity": 0.8013885701121639, + "compression_loss": 90.59623718261719, + "distillation_loss": 3.327826976776123, + "epoch": 3.4, + "learning_rate": 3.663942894712126e-05, + "loss": 94.3846, + "step": 4028, + "task_loss": 1.7476540803909302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8495898991525638, + "compression/movement_sparsity/importance_threshold": -0.0010534324605233894, + "compression/movement_sparsity/linear_layer_sparsity": 0.8300068125154537, + "compression/movement_sparsity/model_sparsity": 0.8014935251059312, + "compression_loss": 90.62123107910156, + "distillation_loss": 4.16901969909668, + "epoch": 3.41, + "learning_rate": 3.6634732788578944e-05, + "loss": 94.4001, + "step": 4029, + "task_loss": 2.4603734016418457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8498286452997922, + "compression/movement_sparsity/importance_threshold": -0.0010517603458190024, + "compression/movement_sparsity/linear_layer_sparsity": 0.8302250009348547, + "compression/movement_sparsity/model_sparsity": 0.8017042180818996, + "compression_loss": 90.64610290527344, + "distillation_loss": 4.509100914001465, + "epoch": 3.41, + "learning_rate": 3.663003663003663e-05, + "loss": 94.8197, + "step": 4030, + "task_loss": 2.9440319538116455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.850067138671875, + "compression/movement_sparsity/importance_threshold": -0.0010500900014847048, + "compression/movement_sparsity/linear_layer_sparsity": 0.8305486705411627, + "compression/movement_sparsity/model_sparsity": 0.8020167686415066, + "compression_loss": 90.67100524902344, + "distillation_loss": 5.5777997970581055, + "epoch": 3.41, + "learning_rate": 3.6625340471494316e-05, + "loss": 95.1193, + "step": 4031, + "task_loss": 2.953101873397827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8503053794026975, + "compression/movement_sparsity/importance_threshold": -0.0010484214265827985, + "compression/movement_sparsity/linear_layer_sparsity": 0.8307230137961665, + "compression/movement_sparsity/model_sparsity": 0.8021851226693586, + "compression_loss": 90.69589233398438, + "distillation_loss": 4.493691921234131, + "epoch": 3.41, + "learning_rate": 3.662064431295201e-05, + "loss": 94.8027, + "step": 4032, + "task_loss": 3.440406322479248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8505433676261449, + "compression/movement_sparsity/importance_threshold": -0.0010467546201755875, + "compression/movement_sparsity/linear_layer_sparsity": 0.8308071626471728, + "compression/movement_sparsity/model_sparsity": 0.8022663807484607, + "compression_loss": 90.72071838378906, + "distillation_loss": 4.116029262542725, + "epoch": 3.41, + "learning_rate": 3.6615948154409696e-05, + "loss": 95.1318, + "step": 4033, + "task_loss": 1.9901171922683716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.850781103476102, + "compression/movement_sparsity/importance_threshold": -0.0010450895813253775, + "compression/movement_sparsity/linear_layer_sparsity": 0.8310351288840353, + "compression/movement_sparsity/model_sparsity": 0.8024865156437807, + "compression_loss": 90.74551391601562, + "distillation_loss": 3.526010036468506, + "epoch": 3.41, + "learning_rate": 3.661125199586738e-05, + "loss": 95.1513, + "step": 4034, + "task_loss": 3.065296173095703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8510185870864545, + "compression/movement_sparsity/importance_threshold": -0.0010434263090944697, + "compression/movement_sparsity/linear_layer_sparsity": 0.8311413612935031, + "compression/movement_sparsity/model_sparsity": 0.8025890986431743, + "compression_loss": 90.77027130126953, + "distillation_loss": 3.5066988468170166, + "epoch": 3.41, + "learning_rate": 3.660655583732507e-05, + "loss": 94.3973, + "step": 4035, + "task_loss": 2.283299446105957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8512558185910869, + "compression/movement_sparsity/importance_threshold": -0.0010417648025451715, + "compression/movement_sparsity/linear_layer_sparsity": 0.8312788946430152, + "compression/movement_sparsity/model_sparsity": 0.8027219072990286, + "compression_loss": 90.7950210571289, + "distillation_loss": 4.981856346130371, + "epoch": 3.41, + "learning_rate": 3.6601859678782755e-05, + "loss": 94.8677, + "step": 4036, + "task_loss": 3.1459848880767822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8514927981238849, + "compression/movement_sparsity/importance_threshold": -0.0010401050607397842, + "compression/movement_sparsity/linear_layer_sparsity": 0.8314839307055139, + "compression/movement_sparsity/model_sparsity": 0.8029198997420157, + "compression_loss": 90.81974029541016, + "distillation_loss": 2.9590530395507812, + "epoch": 3.41, + "learning_rate": 3.659716352024045e-05, + "loss": 94.5023, + "step": 4037, + "task_loss": 1.815351963043213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8517295258187332, + "compression/movement_sparsity/importance_threshold": -0.0010384470827406135, + "compression/movement_sparsity/linear_layer_sparsity": 0.8317092855496641, + "compression/movement_sparsity/model_sparsity": 0.8031375129539967, + "compression_loss": 90.84449005126953, + "distillation_loss": 5.350484371185303, + "epoch": 3.41, + "learning_rate": 3.6592467361698134e-05, + "loss": 95.3318, + "step": 4038, + "task_loss": 3.2526042461395264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8519660018095174, + "compression/movement_sparsity/importance_threshold": -0.0010367908676099605, + "compression/movement_sparsity/linear_layer_sparsity": 0.8318864071357271, + "compression/movement_sparsity/model_sparsity": 0.8033085498686888, + "compression_loss": 90.8691177368164, + "distillation_loss": 3.8340611457824707, + "epoch": 3.41, + "learning_rate": 3.658777120315582e-05, + "loss": 94.924, + "step": 4039, + "task_loss": 2.467369794845581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8522022262301221, + "compression/movement_sparsity/importance_threshold": -0.0010351364144101337, + "compression/movement_sparsity/linear_layer_sparsity": 0.8320084271431449, + "compression/movement_sparsity/model_sparsity": 0.8034263781134744, + "compression_loss": 90.89376831054688, + "distillation_loss": 3.892963409423828, + "epoch": 3.41, + "learning_rate": 3.658307504461351e-05, + "loss": 94.4309, + "step": 4040, + "task_loss": 1.9918533563613892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.852438199214433, + "compression/movement_sparsity/importance_threshold": -0.0010334837222034333, + "compression/movement_sparsity/linear_layer_sparsity": 0.8321930013339804, + "compression/movement_sparsity/model_sparsity": 0.8036046116130381, + "compression_loss": 90.91842651367188, + "distillation_loss": 4.417028903961182, + "epoch": 3.42, + "learning_rate": 3.657837888607119e-05, + "loss": 94.5218, + "step": 4041, + "task_loss": 2.805022716522217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8526739208963346, + "compression/movement_sparsity/importance_threshold": -0.001031832790052166, + "compression/movement_sparsity/linear_layer_sparsity": 0.8324261307354293, + "compression/movement_sparsity/model_sparsity": 0.8038297323023571, + "compression_loss": 90.9429702758789, + "distillation_loss": 4.495639801025391, + "epoch": 3.42, + "learning_rate": 3.6573682727528886e-05, + "loss": 94.9987, + "step": 4042, + "task_loss": 1.9722598791122437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8529093914097126, + "compression/movement_sparsity/importance_threshold": -0.0010301836170186337, + "compression/movement_sparsity/linear_layer_sparsity": 0.8326292350827709, + "compression/movement_sparsity/model_sparsity": 0.8040258593905455, + "compression_loss": 90.967529296875, + "distillation_loss": 4.092824935913086, + "epoch": 3.42, + "learning_rate": 3.6568986568986566e-05, + "loss": 95.24, + "step": 4043, + "task_loss": 2.367692232131958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.853144610888452, + "compression/movement_sparsity/importance_threshold": -0.0010285362021651406, + "compression/movement_sparsity/linear_layer_sparsity": 0.832686375694082, + "compression/movement_sparsity/model_sparsity": 0.8040810370460729, + "compression_loss": 90.99202728271484, + "distillation_loss": 4.864141464233398, + "epoch": 3.42, + "learning_rate": 3.656429041044426e-05, + "loss": 95.4717, + "step": 4044, + "task_loss": 2.0035738945007324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8533795794664376, + "compression/movement_sparsity/importance_threshold": -0.001026890544553993, + "compression/movement_sparsity/linear_layer_sparsity": 0.8328888719088742, + "compression/movement_sparsity/model_sparsity": 0.8042765768929359, + "compression_loss": 91.01655578613281, + "distillation_loss": 4.6106977462768555, + "epoch": 3.42, + "learning_rate": 3.6559594251901945e-05, + "loss": 95.2744, + "step": 4045, + "task_loss": 2.313793897628784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8536142972775551, + "compression/movement_sparsity/importance_threshold": -0.0010252466432474914, + "compression/movement_sparsity/linear_layer_sparsity": 0.8329748213091934, + "compression/movement_sparsity/model_sparsity": 0.8043595736669429, + "compression_loss": 91.04108428955078, + "distillation_loss": 4.225437164306641, + "epoch": 3.42, + "learning_rate": 3.655489809335964e-05, + "loss": 94.9898, + "step": 4046, + "task_loss": 3.497863531112671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8538487644556892, + "compression/movement_sparsity/importance_threshold": -0.0010236044973079414, + "compression/movement_sparsity/linear_layer_sparsity": 0.833165619915535, + "compression/movement_sparsity/model_sparsity": 0.8045438177541914, + "compression_loss": 91.06556701660156, + "distillation_loss": 4.941924095153809, + "epoch": 3.42, + "learning_rate": 3.655020193481732e-05, + "loss": 95.3346, + "step": 4047, + "task_loss": 2.1771092414855957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8540829811347255, + "compression/movement_sparsity/importance_threshold": -0.0010219641057976469, + "compression/movement_sparsity/linear_layer_sparsity": 0.8334211906004744, + "compression/movement_sparsity/model_sparsity": 0.8047906087998757, + "compression_loss": 91.08998107910156, + "distillation_loss": 5.057939052581787, + "epoch": 3.42, + "learning_rate": 3.6545505776275004e-05, + "loss": 95.9447, + "step": 4048, + "task_loss": 3.3483328819274902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8543169474485485, + "compression/movement_sparsity/importance_threshold": -0.0010203254677789136, + "compression/movement_sparsity/linear_layer_sparsity": 0.833557758092408, + "compression/movement_sparsity/model_sparsity": 0.8049224847783306, + "compression_loss": 91.11447143554688, + "distillation_loss": 6.482856750488281, + "epoch": 3.42, + "learning_rate": 3.65408096177327e-05, + "loss": 95.3204, + "step": 4049, + "task_loss": 3.5287153720855713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8545506635310439, + "compression/movement_sparsity/importance_threshold": -0.001018688582314041, + "compression/movement_sparsity/linear_layer_sparsity": 0.8337770435352314, + "compression/movement_sparsity/model_sparsity": 0.8051342370915923, + "compression_loss": 91.13887023925781, + "distillation_loss": 3.861699104309082, + "epoch": 3.42, + "learning_rate": 3.6536113459190384e-05, + "loss": 95.8628, + "step": 4050, + "task_loss": 2.765523910522461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8547841295160965, + "compression/movement_sparsity/importance_threshold": -0.0010170534484653382, + "compression/movement_sparsity/linear_layer_sparsity": 0.8339206582102378, + "compression/movement_sparsity/model_sparsity": 0.8052729181607017, + "compression_loss": 91.16327667236328, + "distillation_loss": 4.048453330993652, + "epoch": 3.42, + "learning_rate": 3.653141730064808e-05, + "loss": 95.7282, + "step": 4051, + "task_loss": 3.1812658309936523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8550173455375915, + "compression/movement_sparsity/importance_threshold": -0.0010154200652951074, + "compression/movement_sparsity/linear_layer_sparsity": 0.8340673016238236, + "compression/movement_sparsity/model_sparsity": 0.8054145239219029, + "compression_loss": 91.18772888183594, + "distillation_loss": 3.323960304260254, + "epoch": 3.42, + "learning_rate": 3.6526721142105757e-05, + "loss": 94.4332, + "step": 4052, + "task_loss": 2.219477415084839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8552503117294142, + "compression/movement_sparsity/importance_threshold": -0.0010137884318656507, + "compression/movement_sparsity/linear_layer_sparsity": 0.8342724688521662, + "compression/movement_sparsity/model_sparsity": 0.8056126430247839, + "compression_loss": 91.21202850341797, + "distillation_loss": 4.079426288604736, + "epoch": 3.43, + "learning_rate": 3.652202498356345e-05, + "loss": 94.9351, + "step": 4053, + "task_loss": 2.078655958175659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8554830282254496, + "compression/movement_sparsity/importance_threshold": -0.0010121585472392747, + "compression/movement_sparsity/linear_layer_sparsity": 0.8344894767789713, + "compression/movement_sparsity/model_sparsity": 0.8058221960617087, + "compression_loss": 91.2363052368164, + "distillation_loss": 5.734855651855469, + "epoch": 3.43, + "learning_rate": 3.6517328825021136e-05, + "loss": 96.0173, + "step": 4054, + "task_loss": 2.78306245803833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8557154951595831, + "compression/movement_sparsity/importance_threshold": -0.0010105304104782806, + "compression/movement_sparsity/linear_layer_sparsity": 0.8346943935997937, + "compression/movement_sparsity/model_sparsity": 0.8060200733593378, + "compression_loss": 91.26063537597656, + "distillation_loss": 3.758881092071533, + "epoch": 3.43, + "learning_rate": 3.651263266647882e-05, + "loss": 95.4428, + "step": 4055, + "task_loss": 2.015864133834839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8559477126656994, + "compression/movement_sparsity/importance_threshold": -0.0010089040206449759, + "compression/movement_sparsity/linear_layer_sparsity": 0.8349092790247595, + "compression/movement_sparsity/model_sparsity": 0.8062275768088912, + "compression_loss": 91.28484344482422, + "distillation_loss": 3.075552463531494, + "epoch": 3.43, + "learning_rate": 3.650793650793651e-05, + "loss": 95.2995, + "step": 4056, + "task_loss": 1.9622286558151245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.856179680877684, + "compression/movement_sparsity/importance_threshold": -0.00100727937680166, + "compression/movement_sparsity/linear_layer_sparsity": 0.8350782564043273, + "compression/movement_sparsity/model_sparsity": 0.8063907492956357, + "compression_loss": 91.30905151367188, + "distillation_loss": 3.255462646484375, + "epoch": 3.43, + "learning_rate": 3.6503240349394195e-05, + "loss": 94.978, + "step": 4057, + "task_loss": 1.6003787517547607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8564113999294217, + "compression/movement_sparsity/importance_threshold": -0.001005656478010642, + "compression/movement_sparsity/linear_layer_sparsity": 0.8352217876101602, + "compression/movement_sparsity/model_sparsity": 0.8065293497629946, + "compression_loss": 91.3332290649414, + "distillation_loss": 5.117107391357422, + "epoch": 3.43, + "learning_rate": 3.649854419085189e-05, + "loss": 95.4985, + "step": 4058, + "task_loss": 3.1110661029815674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8566428699547981, + "compression/movement_sparsity/importance_threshold": -0.0010040353233342217, + "compression/movement_sparsity/linear_layer_sparsity": 0.8354285526769661, + "compression/movement_sparsity/model_sparsity": 0.806729011813672, + "compression_loss": 91.3573226928711, + "distillation_loss": 5.257266044616699, + "epoch": 3.43, + "learning_rate": 3.6493848032309575e-05, + "loss": 95.7893, + "step": 4059, + "task_loss": 3.6231908798217773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8568740910876979, + "compression/movement_sparsity/importance_threshold": -0.0010024159118347071, + "compression/movement_sparsity/linear_layer_sparsity": 0.8355146093947942, + "compression/movement_sparsity/model_sparsity": 0.8068121122185011, + "compression_loss": 91.38145446777344, + "distillation_loss": 3.8260996341705322, + "epoch": 3.43, + "learning_rate": 3.648915187376726e-05, + "loss": 94.9752, + "step": 4060, + "task_loss": 1.9420841932296753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8571050634620064, + "compression/movement_sparsity/importance_threshold": -0.0010007982425743979, + "compression/movement_sparsity/linear_layer_sparsity": 0.8355958129763943, + "compression/movement_sparsity/model_sparsity": 0.806890526207262, + "compression_loss": 91.40552520751953, + "distillation_loss": 6.000916481018066, + "epoch": 3.43, + "learning_rate": 3.648445571522495e-05, + "loss": 95.8092, + "step": 4061, + "task_loss": 1.9992434978485107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.857335787211609, + "compression/movement_sparsity/importance_threshold": -0.0009991823146156004, + "compression/movement_sparsity/linear_layer_sparsity": 0.8357542374676045, + "compression/movement_sparsity/model_sparsity": 0.8070435083298282, + "compression_loss": 91.42961120605469, + "distillation_loss": 5.9295501708984375, + "epoch": 3.43, + "learning_rate": 3.6479759556682634e-05, + "loss": 95.7548, + "step": 4062, + "task_loss": 2.9101176261901855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8575662624703904, + "compression/movement_sparsity/importance_threshold": -0.0009975681270206187, + "compression/movement_sparsity/linear_layer_sparsity": 0.8359264701449368, + "compression/movement_sparsity/model_sparsity": 0.8072098242848446, + "compression_loss": 91.45365142822266, + "distillation_loss": 4.540140151977539, + "epoch": 3.43, + "learning_rate": 3.647506339814033e-05, + "loss": 95.8558, + "step": 4063, + "task_loss": 3.298372745513916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8577964893722361, + "compression/movement_sparsity/importance_threshold": -0.0009959556788517558, + "compression/movement_sparsity/linear_layer_sparsity": 0.8360303892658834, + "compression/movement_sparsity/model_sparsity": 0.807310173464294, + "compression_loss": 91.4776611328125, + "distillation_loss": 5.1201324462890625, + "epoch": 3.44, + "learning_rate": 3.6470367239598006e-05, + "loss": 95.8437, + "step": 4064, + "task_loss": 2.2403018474578857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.858026468051031, + "compression/movement_sparsity/importance_threshold": -0.0009943449691713156, + "compression/movement_sparsity/linear_layer_sparsity": 0.8362778038201599, + "compression/movement_sparsity/model_sparsity": 0.8075490885674949, + "compression_loss": 91.50166320800781, + "distillation_loss": 4.3699870109558105, + "epoch": 3.44, + "learning_rate": 3.64656710810557e-05, + "loss": 95.6369, + "step": 4065, + "task_loss": 2.469433069229126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8582561986406604, + "compression/movement_sparsity/importance_threshold": -0.0009927359970416036, + "compression/movement_sparsity/linear_layer_sparsity": 0.8364019701777521, + "compression/movement_sparsity/model_sparsity": 0.8076689894287234, + "compression_loss": 91.525634765625, + "distillation_loss": 5.020072937011719, + "epoch": 3.44, + "learning_rate": 3.6460974922513386e-05, + "loss": 95.528, + "step": 4066, + "task_loss": 3.185976505279541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8584856812750092, + "compression/movement_sparsity/importance_threshold": -0.000991128761524923, + "compression/movement_sparsity/linear_layer_sparsity": 0.8365673703070292, + "compression/movement_sparsity/model_sparsity": 0.8078287075547296, + "compression_loss": 91.54962158203125, + "distillation_loss": 4.111250400543213, + "epoch": 3.44, + "learning_rate": 3.645627876397107e-05, + "loss": 96.1807, + "step": 4067, + "task_loss": 3.4712131023406982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.858714916087963, + "compression/movement_sparsity/importance_threshold": -0.0009895232616835758, + "compression/movement_sparsity/linear_layer_sparsity": 0.8366727799489302, + "compression/movement_sparsity/model_sparsity": 0.8079304960511533, + "compression_loss": 91.57353210449219, + "distillation_loss": 4.172990798950195, + "epoch": 3.44, + "learning_rate": 3.6451582605428765e-05, + "loss": 95.7808, + "step": 4068, + "task_loss": 2.247328042984009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8589439032134064, + "compression/movement_sparsity/importance_threshold": -0.0009879194965798704, + "compression/movement_sparsity/linear_layer_sparsity": 0.8368362364388826, + "compression/movement_sparsity/model_sparsity": 0.8080883373078248, + "compression_loss": 91.5974349975586, + "distillation_loss": 5.47062873840332, + "epoch": 3.44, + "learning_rate": 3.6446886446886445e-05, + "loss": 95.2017, + "step": 4069, + "task_loss": 2.432814836502075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8591726427852251, + "compression/movement_sparsity/importance_threshold": -0.0009863174652761054, + "compression/movement_sparsity/linear_layer_sparsity": 0.8370138826883217, + "compression/movement_sparsity/model_sparsity": 0.8082598808620919, + "compression_loss": 91.62129974365234, + "distillation_loss": 4.852380752563477, + "epoch": 3.44, + "learning_rate": 3.644219028834414e-05, + "loss": 95.8271, + "step": 4070, + "task_loss": 2.304025411605835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8594011349373036, + "compression/movement_sparsity/importance_threshold": -0.000984717166834589, + "compression/movement_sparsity/linear_layer_sparsity": 0.8372518294534954, + "compression/movement_sparsity/model_sparsity": 0.808489653423872, + "compression_loss": 91.6451416015625, + "distillation_loss": 8.230361938476562, + "epoch": 3.44, + "learning_rate": 3.6437494129801824e-05, + "loss": 96.9659, + "step": 4071, + "task_loss": 5.1191277503967285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8596293798035276, + "compression/movement_sparsity/importance_threshold": -0.0009831186003176236, + "compression/movement_sparsity/linear_layer_sparsity": 0.8373328184000781, + "compression/movement_sparsity/model_sparsity": 0.8085678601509885, + "compression_loss": 91.66901397705078, + "distillation_loss": 4.487608909606934, + "epoch": 3.44, + "learning_rate": 3.643279797125951e-05, + "loss": 96.3496, + "step": 4072, + "task_loss": 2.969597101211548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8598573775177818, + "compression/movement_sparsity/importance_threshold": -0.0009815217647875145, + "compression/movement_sparsity/linear_layer_sparsity": 0.8375153416340803, + "compression/movement_sparsity/model_sparsity": 0.8087441131503955, + "compression_loss": 91.69283294677734, + "distillation_loss": 5.0012526512146, + "epoch": 3.44, + "learning_rate": 3.64281018127172e-05, + "loss": 96.6891, + "step": 4073, + "task_loss": 2.7415690422058105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8600851282139517, + "compression/movement_sparsity/importance_threshold": -0.0009799266593065632, + "compression/movement_sparsity/linear_layer_sparsity": 0.8376961477879429, + "compression/movement_sparsity/model_sparsity": 0.8089187080566481, + "compression_loss": 91.71662902832031, + "distillation_loss": 3.738043785095215, + "epoch": 3.44, + "learning_rate": 3.642340565417488e-05, + "loss": 96.1812, + "step": 4074, + "task_loss": 2.1282966136932373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8603126320259223, + "compression/movement_sparsity/importance_threshold": -0.0009783332829370744, + "compression/movement_sparsity/linear_layer_sparsity": 0.8379011123054357, + "compression/movement_sparsity/model_sparsity": 0.8091166314124205, + "compression_loss": 91.74044036865234, + "distillation_loss": 4.446313858032227, + "epoch": 3.44, + "learning_rate": 3.6418709495632576e-05, + "loss": 96.1875, + "step": 4075, + "task_loss": 2.660594940185547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8605398890875786, + "compression/movement_sparsity/importance_threshold": -0.0009767416347413547, + "compression/movement_sparsity/linear_layer_sparsity": 0.8379707852169321, + "compression/movement_sparsity/model_sparsity": 0.809183910845068, + "compression_loss": 91.76414489746094, + "distillation_loss": 2.522665500640869, + "epoch": 3.45, + "learning_rate": 3.641401333709026e-05, + "loss": 95.9157, + "step": 4076, + "task_loss": 2.1633448600769043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8607668995328062, + "compression/movement_sparsity/importance_threshold": -0.0009751517137817035, + "compression/movement_sparsity/linear_layer_sparsity": 0.8381572553504217, + "compression/movement_sparsity/model_sparsity": 0.809363975155823, + "compression_loss": 91.78778839111328, + "distillation_loss": 5.533056259155273, + "epoch": 3.45, + "learning_rate": 3.640931717854795e-05, + "loss": 95.9745, + "step": 4077, + "task_loss": 3.6567044258117676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8609936634954896, + "compression/movement_sparsity/importance_threshold": -0.00097356351912043, + "compression/movement_sparsity/linear_layer_sparsity": 0.8382955041499919, + "compression/movement_sparsity/model_sparsity": 0.8094974746838249, + "compression_loss": 91.81153869628906, + "distillation_loss": 3.513753890991211, + "epoch": 3.45, + "learning_rate": 3.6404621020005635e-05, + "loss": 95.7425, + "step": 4078, + "task_loss": 1.9147870540618896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8612201811095145, + "compression/movement_sparsity/importance_threshold": -0.0009719770498198329, + "compression/movement_sparsity/linear_layer_sparsity": 0.8384048606913804, + "compression/movement_sparsity/model_sparsity": 0.8096030744915967, + "compression_loss": 91.83518981933594, + "distillation_loss": 3.5299763679504395, + "epoch": 3.45, + "learning_rate": 3.639992486146332e-05, + "loss": 95.3543, + "step": 4079, + "task_loss": 1.5012915134429932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8614464525087656, + "compression/movement_sparsity/importance_threshold": -0.0009703923049422195, + "compression/movement_sparsity/linear_layer_sparsity": 0.8386323380373698, + "compression/movement_sparsity/model_sparsity": 0.8098227372909491, + "compression_loss": 91.8587646484375, + "distillation_loss": 3.9649181365966797, + "epoch": 3.45, + "learning_rate": 3.6395228702921015e-05, + "loss": 96.1386, + "step": 4080, + "task_loss": 2.0053443908691406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8616724778271284, + "compression/movement_sparsity/importance_threshold": -0.0009688092835498929, + "compression/movement_sparsity/linear_layer_sparsity": 0.8387931592862746, + "compression/movement_sparsity/model_sparsity": 0.8099780338352102, + "compression_loss": 91.88239288330078, + "distillation_loss": 4.441703796386719, + "epoch": 3.45, + "learning_rate": 3.63905325443787e-05, + "loss": 96.5573, + "step": 4081, + "task_loss": 3.1673943996429443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8618982571984878, + "compression/movement_sparsity/importance_threshold": -0.0009672279847051579, + "compression/movement_sparsity/linear_layer_sparsity": 0.8390496004354515, + "compression/movement_sparsity/model_sparsity": 0.8102256654420075, + "compression_loss": 91.90599060058594, + "distillation_loss": 5.3949761390686035, + "epoch": 3.45, + "learning_rate": 3.638583638583639e-05, + "loss": 96.3088, + "step": 4082, + "task_loss": 4.461790561676025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.862123790756729, + "compression/movement_sparsity/importance_threshold": -0.0009656484074703183, + "compression/movement_sparsity/linear_layer_sparsity": 0.8392365475356466, + "compression/movement_sparsity/model_sparsity": 0.8104061903341944, + "compression_loss": 91.92950439453125, + "distillation_loss": 3.968522071838379, + "epoch": 3.45, + "learning_rate": 3.6381140227294074e-05, + "loss": 95.3029, + "step": 4083, + "task_loss": 1.5124552249908447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8623490786357372, + "compression/movement_sparsity/importance_threshold": -0.0009640705509076763, + "compression/movement_sparsity/linear_layer_sparsity": 0.8393858619627829, + "compression/movement_sparsity/model_sparsity": 0.8105503753514136, + "compression_loss": 91.95293426513672, + "distillation_loss": 3.84421706199646, + "epoch": 3.45, + "learning_rate": 3.637644406875177e-05, + "loss": 95.8285, + "step": 4084, + "task_loss": 0.8956551551818848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8625741209693975, + "compression/movement_sparsity/importance_threshold": -0.0009624944140795384, + "compression/movement_sparsity/linear_layer_sparsity": 0.8394532692824284, + "compression/movement_sparsity/model_sparsity": 0.8106154670222601, + "compression_loss": 91.9764175415039, + "distillation_loss": 5.489009380340576, + "epoch": 3.45, + "learning_rate": 3.637174791020945e-05, + "loss": 96.0301, + "step": 4085, + "task_loss": 2.8080403804779053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8627989178915951, + "compression/movement_sparsity/importance_threshold": -0.000960919996048205, + "compression/movement_sparsity/linear_layer_sparsity": 0.8396178227958033, + "compression/movement_sparsity/model_sparsity": 0.8107743676162248, + "compression_loss": 91.99991607666016, + "distillation_loss": 3.9707391262054443, + "epoch": 3.45, + "learning_rate": 3.636705175166713e-05, + "loss": 96.3037, + "step": 4086, + "task_loss": 2.288635730743408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8630234695362149, + "compression/movement_sparsity/importance_threshold": -0.0009593472958759853, + "compression/movement_sparsity/linear_layer_sparsity": 0.8397292779906957, + "compression/movement_sparsity/model_sparsity": 0.8108819939822963, + "compression_loss": 92.02336120605469, + "distillation_loss": 5.043789386749268, + "epoch": 3.45, + "learning_rate": 3.6362355593124826e-05, + "loss": 96.7005, + "step": 4087, + "task_loss": 2.462646245956421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8632477760371424, + "compression/movement_sparsity/importance_threshold": -0.0009577763126251788, + "compression/movement_sparsity/linear_layer_sparsity": 0.8399765733032959, + "compression/movement_sparsity/model_sparsity": 0.8111207939401394, + "compression_loss": 92.04676055908203, + "distillation_loss": 5.533036231994629, + "epoch": 3.46, + "learning_rate": 3.635765943458251e-05, + "loss": 96.5232, + "step": 4088, + "task_loss": 3.296221971511841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8634718375282625, + "compression/movement_sparsity/importance_threshold": -0.0009562070453580911, + "compression/movement_sparsity/linear_layer_sparsity": 0.8401702217857023, + "compression/movement_sparsity/model_sparsity": 0.8113077900014427, + "compression_loss": 92.07014465332031, + "distillation_loss": 5.672863960266113, + "epoch": 3.46, + "learning_rate": 3.6352963276040205e-05, + "loss": 96.7002, + "step": 4089, + "task_loss": 3.5039267539978027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8636956541434606, + "compression/movement_sparsity/importance_threshold": -0.0009546394931370253, + "compression/movement_sparsity/linear_layer_sparsity": 0.8403799798185848, + "compression/movement_sparsity/model_sparsity": 0.8115103422006045, + "compression_loss": 92.093505859375, + "distillation_loss": 3.9402217864990234, + "epoch": 3.46, + "learning_rate": 3.6348267117497885e-05, + "loss": 96.8754, + "step": 4090, + "task_loss": 2.8740739822387695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8639192260166213, + "compression/movement_sparsity/importance_threshold": -0.0009530736550242887, + "compression/movement_sparsity/linear_layer_sparsity": 0.8405805681665552, + "compression/movement_sparsity/model_sparsity": 0.8117040397217402, + "compression_loss": 92.11685943603516, + "distillation_loss": 5.592130661010742, + "epoch": 3.46, + "learning_rate": 3.634357095895558e-05, + "loss": 96.7769, + "step": 4091, + "task_loss": 2.790256977081299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8641425532816303, + "compression/movement_sparsity/importance_threshold": -0.000951509530082181, + "compression/movement_sparsity/linear_layer_sparsity": 0.8408162493398781, + "compression/movement_sparsity/model_sparsity": 0.8119316245217193, + "compression_loss": 92.14022827148438, + "distillation_loss": 3.55947208404541, + "epoch": 3.46, + "learning_rate": 3.6338874800413264e-05, + "loss": 96.5903, + "step": 4092, + "task_loss": 1.6302986145019531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8643656360723726, + "compression/movement_sparsity/importance_threshold": -0.0009499471173730076, + "compression/movement_sparsity/linear_layer_sparsity": 0.8410552931284743, + "compression/movement_sparsity/model_sparsity": 0.8121624564207924, + "compression_loss": 92.16349792480469, + "distillation_loss": 5.414210319519043, + "epoch": 3.46, + "learning_rate": 3.633417864187095e-05, + "loss": 96.9089, + "step": 4093, + "task_loss": 2.384058713912964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.864588474522733, + "compression/movement_sparsity/importance_threshold": -0.0009483864159590743, + "compression/movement_sparsity/linear_layer_sparsity": 0.8412111300753073, + "compression/movement_sparsity/model_sparsity": 0.8123129398890913, + "compression_loss": 92.18675231933594, + "distillation_loss": 4.2238006591796875, + "epoch": 3.46, + "learning_rate": 3.632948248332864e-05, + "loss": 96.3399, + "step": 4094, + "task_loss": 1.5896131992340088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8648110687665972, + "compression/movement_sparsity/importance_threshold": -0.0009468274249026824, + "compression/movement_sparsity/linear_layer_sparsity": 0.841260686916002, + "compression/movement_sparsity/model_sparsity": 0.8123607942998534, + "compression_loss": 92.21001434326172, + "distillation_loss": 3.8465700149536133, + "epoch": 3.46, + "learning_rate": 3.6324786324786323e-05, + "loss": 96.8738, + "step": 4095, + "task_loss": 3.6973462104797363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8650334189378499, + "compression/movement_sparsity/importance_threshold": -0.0009452701432661383, + "compression/movement_sparsity/linear_layer_sparsity": 0.8413490092256808, + "compression/movement_sparsity/model_sparsity": 0.8124460824664835, + "compression_loss": 92.23323059082031, + "distillation_loss": 3.7955331802368164, + "epoch": 3.46, + "learning_rate": 3.6320090166244017e-05, + "loss": 96.2236, + "step": 4096, + "task_loss": 1.742809534072876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8652555251703765, + "compression/movement_sparsity/importance_threshold": -0.0009437145701117434, + "compression/movement_sparsity/linear_layer_sparsity": 0.8414236068184108, + "compression/movement_sparsity/model_sparsity": 0.8125181174024142, + "compression_loss": 92.2564926147461, + "distillation_loss": 3.4154999256134033, + "epoch": 3.46, + "learning_rate": 3.63153940077017e-05, + "loss": 96.3124, + "step": 4097, + "task_loss": 2.207719564437866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8654773875980619, + "compression/movement_sparsity/importance_threshold": -0.0009421607045018042, + "compression/movement_sparsity/linear_layer_sparsity": 0.8416875124965277, + "compression/movement_sparsity/model_sparsity": 0.812772957108619, + "compression_loss": 92.27965545654297, + "distillation_loss": 4.931546211242676, + "epoch": 3.46, + "learning_rate": 3.631069784915939e-05, + "loss": 96.589, + "step": 4098, + "task_loss": 2.900505781173706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8656990063547915, + "compression/movement_sparsity/importance_threshold": -0.0009406085454986227, + "compression/movement_sparsity/linear_layer_sparsity": 0.8417996712173106, + "compression/movement_sparsity/model_sparsity": 0.8128812628323024, + "compression_loss": 92.30279541015625, + "distillation_loss": 6.492554187774658, + "epoch": 3.46, + "learning_rate": 3.6306001690617076e-05, + "loss": 96.9467, + "step": 4099, + "task_loss": 2.6742470264434814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8659203815744502, + "compression/movement_sparsity/importance_threshold": -0.0009390580921645048, + "compression/movement_sparsity/linear_layer_sparsity": 0.8419991386935233, + "compression/movement_sparsity/model_sparsity": 0.8130738779870735, + "compression_loss": 92.32596588134766, + "distillation_loss": 3.8657307624816895, + "epoch": 3.47, + "learning_rate": 3.630130553207476e-05, + "loss": 96.578, + "step": 4100, + "task_loss": 1.6040565967559814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8661415133909232, + "compression/movement_sparsity/importance_threshold": -0.0009375093435617533, + "compression/movement_sparsity/linear_layer_sparsity": 0.8421679014380737, + "compression/movement_sparsity/model_sparsity": 0.8132368432121737, + "compression_loss": 92.34902954101562, + "distillation_loss": 3.511505126953125, + "epoch": 3.47, + "learning_rate": 3.6296609373532455e-05, + "loss": 95.8788, + "step": 4101, + "task_loss": 2.006782054901123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8663624019380959, + "compression/movement_sparsity/importance_threshold": -0.0009359622987526704, + "compression/movement_sparsity/linear_layer_sparsity": 0.8423449037824604, + "compression/movement_sparsity/model_sparsity": 0.8134077649815078, + "compression_loss": 92.3720703125, + "distillation_loss": 4.069624900817871, + "epoch": 3.47, + "learning_rate": 3.629191321499014e-05, + "loss": 96.4233, + "step": 4102, + "task_loss": 1.6975586414337158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.866583047349853, + "compression/movement_sparsity/importance_threshold": -0.0009344169567995645, + "compression/movement_sparsity/linear_layer_sparsity": 0.8425303961342039, + "compression/movement_sparsity/model_sparsity": 0.8135868851003277, + "compression_loss": 92.39503479003906, + "distillation_loss": 3.04959774017334, + "epoch": 3.47, + "learning_rate": 3.628721705644783e-05, + "loss": 95.9314, + "step": 4103, + "task_loss": 1.9361587762832642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8668034497600802, + "compression/movement_sparsity/importance_threshold": -0.0009328733167647341, + "compression/movement_sparsity/linear_layer_sparsity": 0.8426187303680504, + "compression/movement_sparsity/model_sparsity": 0.8136721847814936, + "compression_loss": 92.41804504394531, + "distillation_loss": 5.094153881072998, + "epoch": 3.47, + "learning_rate": 3.6282520897905514e-05, + "loss": 96.6316, + "step": 4104, + "task_loss": 3.217383861541748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.867023609302662, + "compression/movement_sparsity/importance_threshold": -0.0009313313777104893, + "compression/movement_sparsity/linear_layer_sparsity": 0.8428014920854052, + "compression/movement_sparsity/model_sparsity": 0.8138486680716166, + "compression_loss": 92.4410171508789, + "distillation_loss": 3.4006552696228027, + "epoch": 3.47, + "learning_rate": 3.62778247393632e-05, + "loss": 96.2341, + "step": 4105, + "task_loss": 1.98316490650177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.867243526111484, + "compression/movement_sparsity/importance_threshold": -0.0009297911386991287, + "compression/movement_sparsity/linear_layer_sparsity": 0.8429851123428299, + "compression/movement_sparsity/model_sparsity": 0.8140259804083168, + "compression_loss": 92.46395874023438, + "distillation_loss": 6.713625431060791, + "epoch": 3.47, + "learning_rate": 3.6273128580820893e-05, + "loss": 96.9213, + "step": 4106, + "task_loss": 3.0951333045959473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8674632003204312, + "compression/movement_sparsity/importance_threshold": -0.0009282525987929589, + "compression/movement_sparsity/linear_layer_sparsity": 0.8431407704271485, + "compression/movement_sparsity/model_sparsity": 0.8141762911585787, + "compression_loss": 92.48689270019531, + "distillation_loss": 3.7216808795928955, + "epoch": 3.47, + "learning_rate": 3.626843242227857e-05, + "loss": 96.6517, + "step": 4107, + "task_loss": 2.667104482650757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8676826320633888, + "compression/movement_sparsity/importance_threshold": -0.0009267157570542837, + "compression/movement_sparsity/linear_layer_sparsity": 0.8432869368740289, + "compression/movement_sparsity/model_sparsity": 0.8143174363383482, + "compression_loss": 92.50981140136719, + "distillation_loss": 4.892443656921387, + "epoch": 3.47, + "learning_rate": 3.6263736263736266e-05, + "loss": 96.9249, + "step": 4108, + "task_loss": 2.320315361022949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8679018214742417, + "compression/movement_sparsity/importance_threshold": -0.0009251806125454079, + "compression/movement_sparsity/linear_layer_sparsity": 0.8434904347189025, + "compression/movement_sparsity/model_sparsity": 0.8145139434062177, + "compression_loss": 92.53267669677734, + "distillation_loss": 5.138246536254883, + "epoch": 3.47, + "learning_rate": 3.625904010519395e-05, + "loss": 96.3104, + "step": 4109, + "task_loss": 2.6896862983703613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8681207686868754, + "compression/movement_sparsity/importance_threshold": -0.0009236471643286328, + "compression/movement_sparsity/linear_layer_sparsity": 0.8436679855750006, + "compression/movement_sparsity/model_sparsity": 0.8146853948441984, + "compression_loss": 92.55548858642578, + "distillation_loss": 5.047441482543945, + "epoch": 3.47, + "learning_rate": 3.625434394665164e-05, + "loss": 96.2501, + "step": 4110, + "task_loss": 2.9783132076263428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8683394738351747, + "compression/movement_sparsity/importance_threshold": -0.000922115411466265, + "compression/movement_sparsity/linear_layer_sparsity": 0.8438888807804549, + "compression/movement_sparsity/model_sparsity": 0.8148987016197924, + "compression_loss": 92.57828521728516, + "distillation_loss": 2.9110918045043945, + "epoch": 3.47, + "learning_rate": 3.624964778810933e-05, + "loss": 96.7583, + "step": 4111, + "task_loss": 1.374081015586853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8685579370530251, + "compression/movement_sparsity/importance_threshold": -0.0009205853530206065, + "compression/movement_sparsity/linear_layer_sparsity": 0.8440674332666344, + "compression/movement_sparsity/model_sparsity": 0.8150711202787798, + "compression_loss": 92.60106658935547, + "distillation_loss": 3.0939955711364746, + "epoch": 3.48, + "learning_rate": 3.624495162956701e-05, + "loss": 96.3629, + "step": 4112, + "task_loss": 1.7925692796707153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8687761584743113, + "compression/movement_sparsity/importance_threshold": -0.000919056988053963, + "compression/movement_sparsity/linear_layer_sparsity": 0.8442820205874093, + "compression/movement_sparsity/model_sparsity": 0.8152783358649384, + "compression_loss": 92.62384796142578, + "distillation_loss": 3.8929269313812256, + "epoch": 3.48, + "learning_rate": 3.6240255471024705e-05, + "loss": 96.7755, + "step": 4113, + "task_loss": 2.8315958976745605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8689941382329187, + "compression/movement_sparsity/importance_threshold": -0.0009175303156286384, + "compression/movement_sparsity/linear_layer_sparsity": 0.8443706648496143, + "compression/movement_sparsity/model_sparsity": 0.815363934924035, + "compression_loss": 92.64655303955078, + "distillation_loss": 4.712027549743652, + "epoch": 3.48, + "learning_rate": 3.623555931248239e-05, + "loss": 96.8788, + "step": 4114, + "task_loss": 2.8611810207366943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8692118764627327, + "compression/movement_sparsity/importance_threshold": -0.000916005334806934, + "compression/movement_sparsity/linear_layer_sparsity": 0.844465998569863, + "compression/movement_sparsity/model_sparsity": 0.8154559936377124, + "compression_loss": 92.66927337646484, + "distillation_loss": 4.937551021575928, + "epoch": 3.48, + "learning_rate": 3.6230863153940084e-05, + "loss": 97.9486, + "step": 4115, + "task_loss": 2.6333107948303223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8694293732976379, + "compression/movement_sparsity/importance_threshold": -0.0009144820446511572, + "compression/movement_sparsity/linear_layer_sparsity": 0.844653208001746, + "compression/movement_sparsity/model_sparsity": 0.8156367718496866, + "compression_loss": 92.6919937133789, + "distillation_loss": 3.503652572631836, + "epoch": 3.48, + "learning_rate": 3.6226166995397764e-05, + "loss": 96.9067, + "step": 4116, + "task_loss": 2.071707248687744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8696466288715199, + "compression/movement_sparsity/importance_threshold": -0.0009129604442236101, + "compression/movement_sparsity/linear_layer_sparsity": 0.8447835391540061, + "compression/movement_sparsity/model_sparsity": 0.815762625725921, + "compression_loss": 92.71459197998047, + "distillation_loss": 4.400373935699463, + "epoch": 3.48, + "learning_rate": 3.622147083685546e-05, + "loss": 97.4509, + "step": 4117, + "task_loss": 2.686494827270508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8698636433182634, + "compression/movement_sparsity/importance_threshold": -0.0009114405325865984, + "compression/movement_sparsity/linear_layer_sparsity": 0.8449163863056373, + "compression/movement_sparsity/model_sparsity": 0.8158909091692079, + "compression_loss": 92.73721313476562, + "distillation_loss": 5.112481117248535, + "epoch": 3.48, + "learning_rate": 3.621677467831314e-05, + "loss": 96.7751, + "step": 4118, + "task_loss": 2.508967638015747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8700804167717541, + "compression/movement_sparsity/importance_threshold": -0.0009099223088024225, + "compression/movement_sparsity/linear_layer_sparsity": 0.8451095697455059, + "compression/movement_sparsity/model_sparsity": 0.8160774561636153, + "compression_loss": 92.75982666015625, + "distillation_loss": 3.9895613193511963, + "epoch": 3.48, + "learning_rate": 3.621207851977083e-05, + "loss": 96.9279, + "step": 4119, + "task_loss": 1.7788417339324951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8702969493658765, + "compression/movement_sparsity/importance_threshold": -0.0009084057719333906, + "compression/movement_sparsity/linear_layer_sparsity": 0.8452769850591133, + "compression/movement_sparsity/model_sparsity": 0.8162391202461706, + "compression_loss": 92.78246307373047, + "distillation_loss": 4.171713352203369, + "epoch": 3.48, + "learning_rate": 3.6207382361228516e-05, + "loss": 96.4916, + "step": 4120, + "task_loss": 1.8571760654449463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8705132412345163, + "compression/movement_sparsity/importance_threshold": -0.0009068909210418041, + "compression/movement_sparsity/linear_layer_sparsity": 0.8454021411226194, + "compression/movement_sparsity/model_sparsity": 0.8163599768138702, + "compression_loss": 92.80498504638672, + "distillation_loss": 3.7995357513427734, + "epoch": 3.48, + "learning_rate": 3.62026862026862e-05, + "loss": 96.4815, + "step": 4121, + "task_loss": 1.2836567163467407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8707292925115583, + "compression/movement_sparsity/importance_threshold": -0.0009053777551899685, + "compression/movement_sparsity/linear_layer_sparsity": 0.8455747672974837, + "compression/movement_sparsity/model_sparsity": 0.8165266727485677, + "compression_loss": 92.82754516601562, + "distillation_loss": 4.409570217132568, + "epoch": 3.48, + "learning_rate": 3.6197990044143895e-05, + "loss": 97.4211, + "step": 4122, + "task_loss": 2.3507399559020996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8709451033308877, + "compression/movement_sparsity/importance_threshold": -0.000903866273440186, + "compression/movement_sparsity/linear_layer_sparsity": 0.8457196459342595, + "compression/movement_sparsity/model_sparsity": 0.8166665743584713, + "compression_loss": 92.85001373291016, + "distillation_loss": 5.588755130767822, + "epoch": 3.48, + "learning_rate": 3.619329388560158e-05, + "loss": 97.0848, + "step": 4123, + "task_loss": 3.704510450363159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8711606738263898, + "compression/movement_sparsity/importance_threshold": -0.0009023564748547614, + "compression/movement_sparsity/linear_layer_sparsity": 0.8458785950888456, + "compression/movement_sparsity/model_sparsity": 0.8168200631206126, + "compression_loss": 92.87252044677734, + "distillation_loss": 3.340946674346924, + "epoch": 3.49, + "learning_rate": 3.618859772705927e-05, + "loss": 96.258, + "step": 4124, + "task_loss": 2.989189863204956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8713760041319496, + "compression/movement_sparsity/importance_threshold": -0.0009008483584959995, + "compression/movement_sparsity/linear_layer_sparsity": 0.8460552874048738, + "compression/movement_sparsity/model_sparsity": 0.8169906855120161, + "compression_loss": 92.89494323730469, + "distillation_loss": 4.447931289672852, + "epoch": 3.49, + "learning_rate": 3.6183901568516954e-05, + "loss": 97.5464, + "step": 4125, + "task_loss": 2.370572328567505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8715910943814523, + "compression/movement_sparsity/importance_threshold": -0.0008993419234262024, + "compression/movement_sparsity/linear_layer_sparsity": 0.8462094549682379, + "compression/movement_sparsity/model_sparsity": 0.8171395569453037, + "compression_loss": 92.91735076904297, + "distillation_loss": 4.193358898162842, + "epoch": 3.49, + "learning_rate": 3.617920540997464e-05, + "loss": 97.6586, + "step": 4126, + "task_loss": 2.0952348709106445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8718059447087829, + "compression/movement_sparsity/importance_threshold": -0.0008978371687076766, + "compression/movement_sparsity/linear_layer_sparsity": 0.8462842314234823, + "compression/movement_sparsity/model_sparsity": 0.8172117645992713, + "compression_loss": 92.93975830078125, + "distillation_loss": 3.5347540378570557, + "epoch": 3.49, + "learning_rate": 3.6174509251432334e-05, + "loss": 97.025, + "step": 4127, + "task_loss": 1.2136402130126953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8720205552478268, + "compression/movement_sparsity/importance_threshold": -0.0008963340934027234, + "compression/movement_sparsity/linear_layer_sparsity": 0.8464045462749282, + "compression/movement_sparsity/model_sparsity": 0.8173279462654383, + "compression_loss": 92.96216583251953, + "distillation_loss": 4.028942108154297, + "epoch": 3.49, + "learning_rate": 3.616981309289002e-05, + "loss": 96.8976, + "step": 4128, + "task_loss": 2.8698887825012207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8722349261324689, + "compression/movement_sparsity/importance_threshold": -0.0008948326965736494, + "compression/movement_sparsity/linear_layer_sparsity": 0.8465271505665601, + "compression/movement_sparsity/model_sparsity": 0.8174463387224776, + "compression_loss": 92.9845199584961, + "distillation_loss": 4.394811630249023, + "epoch": 3.49, + "learning_rate": 3.6165116934347706e-05, + "loss": 97.6738, + "step": 4129, + "task_loss": 2.5135157108306885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8724490574965945, + "compression/movement_sparsity/importance_threshold": -0.0008933329772827566, + "compression/movement_sparsity/linear_layer_sparsity": 0.8465443809887939, + "compression/movement_sparsity/model_sparsity": 0.8174629772267008, + "compression_loss": 93.00684356689453, + "distillation_loss": 4.606848239898682, + "epoch": 3.49, + "learning_rate": 3.616042077580539e-05, + "loss": 96.7904, + "step": 4130, + "task_loss": 2.538403034210205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8726629494740886, + "compression/movement_sparsity/importance_threshold": -0.0008918349345923499, + "compression/movement_sparsity/linear_layer_sparsity": 0.8467007425990031, + "compression/movement_sparsity/model_sparsity": 0.8176139673345746, + "compression_loss": 93.02912139892578, + "distillation_loss": 4.754396438598633, + "epoch": 3.49, + "learning_rate": 3.615572461726308e-05, + "loss": 96.513, + "step": 4131, + "task_loss": 3.5716421604156494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8728766021988363, + "compression/movement_sparsity/importance_threshold": -0.000890338567564734, + "compression/movement_sparsity/linear_layer_sparsity": 0.846789160302023, + "compression/movement_sparsity/model_sparsity": 0.8176993476174911, + "compression_loss": 93.05137634277344, + "distillation_loss": 4.951827049255371, + "epoch": 3.49, + "learning_rate": 3.615102845872077e-05, + "loss": 97.137, + "step": 4132, + "task_loss": 2.5684168338775635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8730900158047231, + "compression/movement_sparsity/importance_threshold": -0.0008888438752622102, + "compression/movement_sparsity/linear_layer_sparsity": 0.8470180446997934, + "compression/movement_sparsity/model_sparsity": 0.8179203691320673, + "compression_loss": 93.0736312866211, + "distillation_loss": 4.65708589553833, + "epoch": 3.49, + "learning_rate": 3.614633230017845e-05, + "loss": 97.591, + "step": 4133, + "task_loss": 2.4607903957366943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8733031904256338, + "compression/movement_sparsity/importance_threshold": -0.000887350856747085, + "compression/movement_sparsity/linear_layer_sparsity": 0.847111446704885, + "compression/movement_sparsity/model_sparsity": 0.818010562490946, + "compression_loss": 93.0958023071289, + "distillation_loss": 4.14515495300293, + "epoch": 3.49, + "learning_rate": 3.6141636141636145e-05, + "loss": 97.2935, + "step": 4134, + "task_loss": 2.647430658340454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8735161261954536, + "compression/movement_sparsity/importance_threshold": -0.0008858595110816615, + "compression/movement_sparsity/linear_layer_sparsity": 0.8472969271324609, + "compression/movement_sparsity/model_sparsity": 0.8181896710952301, + "compression_loss": 93.11800384521484, + "distillation_loss": 4.334054470062256, + "epoch": 3.5, + "learning_rate": 3.613693998309383e-05, + "loss": 97.3124, + "step": 4135, + "task_loss": 2.912120819091797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8737288232480677, + "compression/movement_sparsity/importance_threshold": -0.0008843698373282435, + "compression/movement_sparsity/linear_layer_sparsity": 0.8474132235394135, + "compression/movement_sparsity/model_sparsity": 0.8183019723628342, + "compression_loss": 93.14007568359375, + "distillation_loss": 3.711839437484741, + "epoch": 3.5, + "learning_rate": 3.613224382455152e-05, + "loss": 96.926, + "step": 4136, + "task_loss": 2.7634055614471436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8739412817173613, + "compression/movement_sparsity/importance_threshold": -0.000882881834549135, + "compression/movement_sparsity/linear_layer_sparsity": 0.8474635554510045, + "compression/movement_sparsity/model_sparsity": 0.818350575218423, + "compression_loss": 93.16217041015625, + "distillation_loss": 5.1326141357421875, + "epoch": 3.5, + "learning_rate": 3.6127547666009204e-05, + "loss": 96.9891, + "step": 4137, + "task_loss": 2.757244348526001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8741535017372193, + "compression/movement_sparsity/importance_threshold": -0.0008813955018066415, + "compression/movement_sparsity/linear_layer_sparsity": 0.8475131957608726, + "compression/movement_sparsity/model_sparsity": 0.8183985102309357, + "compression_loss": 93.1842269897461, + "distillation_loss": 3.8709230422973633, + "epoch": 3.5, + "learning_rate": 3.612285150746689e-05, + "loss": 97.1981, + "step": 4138, + "task_loss": 2.748084783554077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8743654834415271, + "compression/movement_sparsity/importance_threshold": -0.0008799108381630644, + "compression/movement_sparsity/linear_layer_sparsity": 0.8476391984402807, + "compression/movement_sparsity/model_sparsity": 0.8185201843306765, + "compression_loss": 93.20624542236328, + "distillation_loss": 5.369759559631348, + "epoch": 3.5, + "learning_rate": 3.611815534892458e-05, + "loss": 98.3797, + "step": 4139, + "task_loss": 3.713080644607544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8745772269641696, + "compression/movement_sparsity/importance_threshold": -0.0008784278426807101, + "compression/movement_sparsity/linear_layer_sparsity": 0.8477183152925447, + "compression/movement_sparsity/model_sparsity": 0.8185965832756733, + "compression_loss": 93.22822570800781, + "distillation_loss": 5.3766350746154785, + "epoch": 3.5, + "learning_rate": 3.611345919038227e-05, + "loss": 98.1595, + "step": 4140, + "task_loss": 2.455111026763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8747887324390324, + "compression/movement_sparsity/importance_threshold": -0.0008769465144218792, + "compression/movement_sparsity/linear_layer_sparsity": 0.8478586627456188, + "compression/movement_sparsity/model_sparsity": 0.818732109361975, + "compression_loss": 93.25021362304688, + "distillation_loss": 3.059952974319458, + "epoch": 3.5, + "learning_rate": 3.6108763031839956e-05, + "loss": 97.2107, + "step": 4141, + "task_loss": 3.122684955596924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.875, + "compression/movement_sparsity/importance_threshold": -0.0008754668524488807, + "compression/movement_sparsity/linear_layer_sparsity": 0.8480690289110506, + "compression/movement_sparsity/model_sparsity": 0.8189352488024624, + "compression_loss": 93.27220153808594, + "distillation_loss": 5.326725959777832, + "epoch": 3.5, + "learning_rate": 3.610406687329764e-05, + "loss": 97.7119, + "step": 4142, + "task_loss": 3.111431121826172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.875211029780958, + "compression/movement_sparsity/importance_threshold": -0.0008739888558240141, + "compression/movement_sparsity/linear_layer_sparsity": 0.848217687508967, + "compression/movement_sparsity/model_sparsity": 0.8190788005202129, + "compression_loss": 93.29415893554688, + "distillation_loss": 4.338418483734131, + "epoch": 3.5, + "learning_rate": 3.609937071475533e-05, + "loss": 97.7256, + "step": 4143, + "task_loss": 1.840023398399353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8754218219157913, + "compression/movement_sparsity/importance_threshold": -0.000872512523609586, + "compression/movement_sparsity/linear_layer_sparsity": 0.8482878254630011, + "compression/movement_sparsity/model_sparsity": 0.8191465290197564, + "compression_loss": 93.31608581542969, + "distillation_loss": 5.189243316650391, + "epoch": 3.5, + "learning_rate": 3.609467455621302e-05, + "loss": 97.6818, + "step": 4144, + "task_loss": 2.3773601055145264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8756323765383853, + "compression/movement_sparsity/importance_threshold": -0.0008710378548678985, + "compression/movement_sparsity/linear_layer_sparsity": 0.8484066497934925, + "compression/movement_sparsity/model_sparsity": 0.819261271368949, + "compression_loss": 93.3379898071289, + "distillation_loss": 3.574031114578247, + "epoch": 3.5, + "learning_rate": 3.608997839767071e-05, + "loss": 97.1067, + "step": 4145, + "task_loss": 1.1576638221740723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8758426937826249, + "compression/movement_sparsity/importance_threshold": -0.0008695648486612573, + "compression/movement_sparsity/linear_layer_sparsity": 0.8484710045262233, + "compression/movement_sparsity/model_sparsity": 0.8193234153186322, + "compression_loss": 93.35987854003906, + "distillation_loss": 4.426519870758057, + "epoch": 3.5, + "learning_rate": 3.6085282239128394e-05, + "loss": 97.8168, + "step": 4146, + "task_loss": 2.5952112674713135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8760527737823953, + "compression/movement_sparsity/importance_threshold": -0.0008680935040519663, + "compression/movement_sparsity/linear_layer_sparsity": 0.8486858899511891, + "compression/movement_sparsity/model_sparsity": 0.8195309187681855, + "compression_loss": 93.3817138671875, + "distillation_loss": 4.929561614990234, + "epoch": 3.51, + "learning_rate": 3.608058608058608e-05, + "loss": 97.4987, + "step": 4147, + "task_loss": 3.311614513397217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8762626166715816, + "compression/movement_sparsity/importance_threshold": -0.0008666238201023285, + "compression/movement_sparsity/linear_layer_sparsity": 0.84888980514193, + "compression/movement_sparsity/model_sparsity": 0.819727828844808, + "compression_loss": 93.40352630615234, + "distillation_loss": 3.5207104682922363, + "epoch": 3.51, + "learning_rate": 3.6075889922043774e-05, + "loss": 96.882, + "step": 4148, + "task_loss": 2.434013843536377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8764722225840692, + "compression/movement_sparsity/importance_threshold": -0.0008651557958746468, + "compression/movement_sparsity/linear_layer_sparsity": 0.8490061253972179, + "compression/movement_sparsity/model_sparsity": 0.8198401531414837, + "compression_loss": 93.42529296875, + "distillation_loss": 3.7370734214782715, + "epoch": 3.51, + "learning_rate": 3.607119376350146e-05, + "loss": 97.4195, + "step": 4149, + "task_loss": 1.7413933277130127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8766815916537428, + "compression/movement_sparsity/importance_threshold": -0.0008636894304312288, + "compression/movement_sparsity/linear_layer_sparsity": 0.8492498673078626, + "compression/movement_sparsity/model_sparsity": 0.82007552176766, + "compression_loss": 93.447021484375, + "distillation_loss": 4.307043075561523, + "epoch": 3.51, + "learning_rate": 3.606649760495914e-05, + "loss": 97.6733, + "step": 4150, + "task_loss": 3.040567636489868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.876890724014488, + "compression/movement_sparsity/importance_threshold": -0.0008622247228343739, + "compression/movement_sparsity/linear_layer_sparsity": 0.8494278474339955, + "compression/movement_sparsity/model_sparsity": 0.8202473877289292, + "compression_loss": 93.46873474121094, + "distillation_loss": 4.055239677429199, + "epoch": 3.51, + "learning_rate": 3.606180144641683e-05, + "loss": 98.0497, + "step": 4151, + "task_loss": 2.5396182537078857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8770996198001896, + "compression/movement_sparsity/importance_threshold": -0.0008607616721463904, + "compression/movement_sparsity/linear_layer_sparsity": 0.8496192541728863, + "compression/movement_sparsity/model_sparsity": 0.8204322190575032, + "compression_loss": 93.49040985107422, + "distillation_loss": 3.8153319358825684, + "epoch": 3.51, + "learning_rate": 3.605710528787452e-05, + "loss": 97.5929, + "step": 4152, + "task_loss": 1.693247675895691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.877308279144733, + "compression/movement_sparsity/importance_threshold": -0.0008593002774295788, + "compression/movement_sparsity/linear_layer_sparsity": 0.8497208361569762, + "compression/movement_sparsity/model_sparsity": 0.820530311387937, + "compression_loss": 93.51212310791016, + "distillation_loss": 3.8544321060180664, + "epoch": 3.51, + "learning_rate": 3.605240912933221e-05, + "loss": 97.5761, + "step": 4153, + "task_loss": 2.557344436645508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8775167021820031, + "compression/movement_sparsity/importance_threshold": -0.0008578405377462454, + "compression/movement_sparsity/linear_layer_sparsity": 0.849757813000815, + "compression/movement_sparsity/model_sparsity": 0.8205660179634358, + "compression_loss": 93.53378295898438, + "distillation_loss": 4.353232383728027, + "epoch": 3.51, + "learning_rate": 3.604771297078989e-05, + "loss": 97.8664, + "step": 4154, + "task_loss": 2.2120399475097656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8777248890458851, + "compression/movement_sparsity/importance_threshold": -0.0008563824521586934, + "compression/movement_sparsity/linear_layer_sparsity": 0.849799893388402, + "compression/movement_sparsity/model_sparsity": 0.8206066527602547, + "compression_loss": 93.55540466308594, + "distillation_loss": 4.8854475021362305, + "epoch": 3.51, + "learning_rate": 3.6043016812247585e-05, + "loss": 98.12, + "step": 4155, + "task_loss": 2.84133243560791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8779328398702643, + "compression/movement_sparsity/importance_threshold": -0.0008549260197292258, + "compression/movement_sparsity/linear_layer_sparsity": 0.8500345133108053, + "compression/movement_sparsity/model_sparsity": 0.8208332127665481, + "compression_loss": 93.57708740234375, + "distillation_loss": 3.9159774780273438, + "epoch": 3.51, + "learning_rate": 3.603832065370527e-05, + "loss": 98.1309, + "step": 4156, + "task_loss": 1.5043833255767822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8781405547890256, + "compression/movement_sparsity/importance_threshold": -0.0008534712395201483, + "compression/movement_sparsity/linear_layer_sparsity": 0.8502012727951928, + "compression/movement_sparsity/model_sparsity": 0.8209942435496348, + "compression_loss": 93.5986099243164, + "distillation_loss": 4.348298072814941, + "epoch": 3.51, + "learning_rate": 3.603362449516296e-05, + "loss": 97.9904, + "step": 4157, + "task_loss": 1.9731087684631348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8783480339360544, + "compression/movement_sparsity/importance_threshold": -0.0008520181105937637, + "compression/movement_sparsity/linear_layer_sparsity": 0.8503316993407939, + "compression/movement_sparsity/model_sparsity": 0.8211201895421555, + "compression_loss": 93.62019348144531, + "distillation_loss": 4.849423408508301, + "epoch": 3.51, + "learning_rate": 3.602892833662065e-05, + "loss": 98.4102, + "step": 4158, + "task_loss": 3.974299192428589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8785552774452356, + "compression/movement_sparsity/importance_threshold": -0.000850566632012377, + "compression/movement_sparsity/linear_layer_sparsity": 0.8504595025695151, + "compression/movement_sparsity/model_sparsity": 0.8212436023368014, + "compression_loss": 93.64170837402344, + "distillation_loss": 5.258194923400879, + "epoch": 3.52, + "learning_rate": 3.602423217807833e-05, + "loss": 97.5949, + "step": 4159, + "task_loss": 2.7566659450531006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8787622854504545, + "compression/movement_sparsity/importance_threshold": -0.0008491168028382911, + "compression/movement_sparsity/linear_layer_sparsity": 0.850643504400304, + "compression/movement_sparsity/model_sparsity": 0.8214212831386469, + "compression_loss": 93.66324615478516, + "distillation_loss": 6.0293073654174805, + "epoch": 3.52, + "learning_rate": 3.6019536019536024e-05, + "loss": 98.5599, + "step": 4160, + "task_loss": 2.3827083110809326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8789690580855962, + "compression/movement_sparsity/importance_threshold": -0.0008476686221338108, + "compression/movement_sparsity/linear_layer_sparsity": 0.8507477097012739, + "compression/movement_sparsity/model_sparsity": 0.8215219086669554, + "compression_loss": 93.68476104736328, + "distillation_loss": 4.878064155578613, + "epoch": 3.52, + "learning_rate": 3.601483986099371e-05, + "loss": 97.8712, + "step": 4161, + "task_loss": 2.801372766494751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8791755954845459, + "compression/movement_sparsity/importance_threshold": -0.0008462220889612392, + "compression/movement_sparsity/linear_layer_sparsity": 0.8508898696278286, + "compression/movement_sparsity/model_sparsity": 0.821659184962698, + "compression_loss": 93.70625305175781, + "distillation_loss": 5.138028144836426, + "epoch": 3.52, + "learning_rate": 3.6010143702451396e-05, + "loss": 98.4596, + "step": 4162, + "task_loss": 3.43042254447937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8793818977811886, + "compression/movement_sparsity/importance_threshold": -0.0008447772023828818, + "compression/movement_sparsity/linear_layer_sparsity": 0.8510083481574585, + "compression/movement_sparsity/model_sparsity": 0.8217735933903525, + "compression_loss": 93.72772979736328, + "distillation_loss": 3.8158538341522217, + "epoch": 3.52, + "learning_rate": 3.600544754390908e-05, + "loss": 98.5376, + "step": 4163, + "task_loss": 1.2900984287261963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8795879651094095, + "compression/movement_sparsity/importance_threshold": -0.00084333396146104, + "compression/movement_sparsity/linear_layer_sparsity": 0.8512126210732285, + "compression/movement_sparsity/model_sparsity": 0.8219708489030487, + "compression_loss": 93.7491226196289, + "distillation_loss": 5.257659435272217, + "epoch": 3.52, + "learning_rate": 3.600075138536677e-05, + "loss": 98.4433, + "step": 4164, + "task_loss": 2.4934074878692627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8797937976030938, + "compression/movement_sparsity/importance_threshold": -0.0008418923652580203, + "compression/movement_sparsity/linear_layer_sparsity": 0.8513078236276332, + "compression/movement_sparsity/model_sparsity": 0.8220627809568324, + "compression_loss": 93.77058410644531, + "distillation_loss": 5.180685043334961, + "epoch": 3.52, + "learning_rate": 3.599605522682446e-05, + "loss": 97.656, + "step": 4165, + "task_loss": 3.353649854660034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8799993953961265, + "compression/movement_sparsity/importance_threshold": -0.0008404524128361266, + "compression/movement_sparsity/linear_layer_sparsity": 0.8514576865664808, + "compression/movement_sparsity/model_sparsity": 0.8222074956426981, + "compression_loss": 93.79193878173828, + "distillation_loss": 3.1469130516052246, + "epoch": 3.52, + "learning_rate": 3.599135906828215e-05, + "loss": 98.4429, + "step": 4166, + "task_loss": 1.9907824993133545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.880204758622393, + "compression/movement_sparsity/importance_threshold": -0.0008390141032576601, + "compression/movement_sparsity/linear_layer_sparsity": 0.8516011700756431, + "compression/movement_sparsity/model_sparsity": 0.8223460500519139, + "compression_loss": 93.81331634521484, + "distillation_loss": 4.858263969421387, + "epoch": 3.52, + "learning_rate": 3.5986662909739835e-05, + "loss": 98.811, + "step": 4167, + "task_loss": 3.7204430103302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8804098874157782, + "compression/movement_sparsity/importance_threshold": -0.0008375774355849274, + "compression/movement_sparsity/linear_layer_sparsity": 0.8518151254155334, + "compression/movement_sparsity/model_sparsity": 0.8225526553676753, + "compression_loss": 93.8345947265625, + "distillation_loss": 4.608565330505371, + "epoch": 3.52, + "learning_rate": 3.598196675119752e-05, + "loss": 98.5426, + "step": 4168, + "task_loss": 2.3603477478027344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8806147819101673, + "compression/movement_sparsity/importance_threshold": -0.0008361424088802324, + "compression/movement_sparsity/linear_layer_sparsity": 0.8519441210610181, + "compression/movement_sparsity/model_sparsity": 0.8226772196159007, + "compression_loss": 93.85592651367188, + "distillation_loss": 5.692918300628662, + "epoch": 3.52, + "learning_rate": 3.597727059265521e-05, + "loss": 98.8037, + "step": 4169, + "task_loss": 2.694190263748169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8808194422394454, + "compression/movement_sparsity/importance_threshold": -0.0008347090222058781, + "compression/movement_sparsity/linear_layer_sparsity": 0.8519806567106545, + "compression/movement_sparsity/model_sparsity": 0.8227125001535751, + "compression_loss": 93.877197265625, + "distillation_loss": 5.2529296875, + "epoch": 3.52, + "learning_rate": 3.59725744341129e-05, + "loss": 98.4049, + "step": 4170, + "task_loss": 2.8716211318969727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8810238685374978, + "compression/movement_sparsity/importance_threshold": -0.0008332772746241683, + "compression/movement_sparsity/linear_layer_sparsity": 0.8520976447193299, + "compression/movement_sparsity/model_sparsity": 0.8228254692642554, + "compression_loss": 93.8984603881836, + "distillation_loss": 6.019613742828369, + "epoch": 3.53, + "learning_rate": 3.596787827557058e-05, + "loss": 98.0078, + "step": 4171, + "task_loss": 3.4348418712615967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8812280609382095, + "compression/movement_sparsity/importance_threshold": -0.000831847165197408, + "compression/movement_sparsity/linear_layer_sparsity": 0.8522051887872377, + "compression/movement_sparsity/model_sparsity": 0.8229293188625864, + "compression_loss": 93.9196548461914, + "distillation_loss": 4.286190986633301, + "epoch": 3.53, + "learning_rate": 3.596318211702827e-05, + "loss": 98.1081, + "step": 4172, + "task_loss": 2.0071964263916016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8814320195754657, + "compression/movement_sparsity/importance_threshold": -0.0008304186929879, + "compression/movement_sparsity/linear_layer_sparsity": 0.8523320023100451, + "compression/movement_sparsity/model_sparsity": 0.8230517759507613, + "compression_loss": 93.9407730102539, + "distillation_loss": 4.196881294250488, + "epoch": 3.53, + "learning_rate": 3.595848595848596e-05, + "loss": 98.1584, + "step": 4173, + "task_loss": 2.8888401985168457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8816357445831514, + "compression/movement_sparsity/importance_threshold": -0.0008289918570579501, + "compression/movement_sparsity/linear_layer_sparsity": 0.8523904188072933, + "compression/movement_sparsity/model_sparsity": 0.8231081856616187, + "compression_loss": 93.96197509765625, + "distillation_loss": 5.669647216796875, + "epoch": 3.53, + "learning_rate": 3.5953789799943646e-05, + "loss": 97.9186, + "step": 4174, + "task_loss": 4.2195820808410645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8818392360951519, + "compression/movement_sparsity/importance_threshold": -0.0008275666564698612, + "compression/movement_sparsity/linear_layer_sparsity": 0.8526316566427344, + "compression/movement_sparsity/model_sparsity": 0.823341136235278, + "compression_loss": 93.98297882080078, + "distillation_loss": 4.817070960998535, + "epoch": 3.53, + "learning_rate": 3.594909364140134e-05, + "loss": 98.4848, + "step": 4175, + "task_loss": 1.8361786603927612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8820424942453525, + "compression/movement_sparsity/importance_threshold": -0.0008261430902859355, + "compression/movement_sparsity/linear_layer_sparsity": 0.8528830538690013, + "compression/movement_sparsity/model_sparsity": 0.8235838971934344, + "compression_loss": 94.0041275024414, + "distillation_loss": 4.981739044189453, + "epoch": 3.53, + "learning_rate": 3.594439748285902e-05, + "loss": 98.4853, + "step": 4176, + "task_loss": 2.181946039199829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8822455191676379, + "compression/movement_sparsity/importance_threshold": -0.0008247211575684804, + "compression/movement_sparsity/linear_layer_sparsity": 0.8530718730635152, + "compression/movement_sparsity/model_sparsity": 0.823766229867741, + "compression_loss": 94.025146484375, + "distillation_loss": 4.7256975173950195, + "epoch": 3.53, + "learning_rate": 3.593970132431671e-05, + "loss": 98.6515, + "step": 4177, + "task_loss": 3.0115597248077393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8824483109958936, + "compression/movement_sparsity/importance_threshold": -0.0008233008573797981, + "compression/movement_sparsity/linear_layer_sparsity": 0.8532088459771483, + "compression/movement_sparsity/model_sparsity": 0.8238984973404129, + "compression_loss": 94.0461654663086, + "distillation_loss": 5.346207618713379, + "epoch": 3.53, + "learning_rate": 3.59350051657744e-05, + "loss": 98.5219, + "step": 4178, + "task_loss": 2.447176933288574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8826508698640047, + "compression/movement_sparsity/importance_threshold": -0.0008218821887821916, + "compression/movement_sparsity/linear_layer_sparsity": 0.853339677944449, + "compression/movement_sparsity/model_sparsity": 0.8240248348271505, + "compression_loss": 94.06723022460938, + "distillation_loss": 5.215678691864014, + "epoch": 3.53, + "learning_rate": 3.593030900723209e-05, + "loss": 98.2285, + "step": 4179, + "task_loss": 3.0503692626953125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8828531959058562, + "compression/movement_sparsity/importance_threshold": -0.0008204651508379656, + "compression/movement_sparsity/linear_layer_sparsity": 0.853439554772567, + "compression/movement_sparsity/model_sparsity": 0.8241212805789656, + "compression_loss": 94.08818817138672, + "distillation_loss": 4.184694290161133, + "epoch": 3.53, + "learning_rate": 3.592561284868977e-05, + "loss": 98.3332, + "step": 4180, + "task_loss": 2.027510166168213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8830552892553332, + "compression/movement_sparsity/importance_threshold": -0.0008190497426094259, + "compression/movement_sparsity/linear_layer_sparsity": 0.8535405643966104, + "compression/movement_sparsity/model_sparsity": 0.8242188202116811, + "compression_loss": 94.10914611816406, + "distillation_loss": 3.8027215003967285, + "epoch": 3.53, + "learning_rate": 3.592091669014746e-05, + "loss": 98.9346, + "step": 4181, + "task_loss": 2.356431722640991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.883257150046321, + "compression/movement_sparsity/importance_threshold": -0.0008176359631588744, + "compression/movement_sparsity/linear_layer_sparsity": 0.8536569085002336, + "compression/movement_sparsity/model_sparsity": 0.8243311675374285, + "compression_loss": 94.13006591796875, + "distillation_loss": 4.215858459472656, + "epoch": 3.53, + "learning_rate": 3.591622053160515e-05, + "loss": 98.9113, + "step": 4182, + "task_loss": 2.6620218753814697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8834587784127047, + "compression/movement_sparsity/importance_threshold": -0.0008162238115486152, + "compression/movement_sparsity/linear_layer_sparsity": 0.8537399484036496, + "compression/movement_sparsity/model_sparsity": 0.8244113547647016, + "compression_loss": 94.15107727050781, + "distillation_loss": 5.0990400314331055, + "epoch": 3.54, + "learning_rate": 3.5911524373062836e-05, + "loss": 98.8428, + "step": 4183, + "task_loss": 2.691236734390259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8836601744883694, + "compression/movement_sparsity/importance_threshold": -0.0008148132868409539, + "compression/movement_sparsity/linear_layer_sparsity": 0.8538972997197726, + "compression/movement_sparsity/model_sparsity": 0.8245633005790465, + "compression_loss": 94.17195892333984, + "distillation_loss": 4.923058986663818, + "epoch": 3.54, + "learning_rate": 3.590682821452052e-05, + "loss": 97.9176, + "step": 4184, + "task_loss": 2.138990879058838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8838613384072003, + "compression/movement_sparsity/importance_threshold": -0.0008134043880981927, + "compression/movement_sparsity/linear_layer_sparsity": 0.8539647189635857, + "compression/movement_sparsity/model_sparsity": 0.8246284037644288, + "compression_loss": 94.19287872314453, + "distillation_loss": 3.6656653881073, + "epoch": 3.54, + "learning_rate": 3.590213205597821e-05, + "loss": 97.9746, + "step": 4185, + "task_loss": 1.2535301446914673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8840622703030825, + "compression/movement_sparsity/importance_threshold": -0.0008119971143826362, + "compression/movement_sparsity/linear_layer_sparsity": 0.8541374643801264, + "compression/movement_sparsity/model_sparsity": 0.8247952148444843, + "compression_loss": 94.21368408203125, + "distillation_loss": 4.167150497436523, + "epoch": 3.54, + "learning_rate": 3.58974358974359e-05, + "loss": 97.861, + "step": 4186, + "task_loss": 2.6228694915771484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8842629703099012, + "compression/movement_sparsity/importance_threshold": -0.0008105914647565876, + "compression/movement_sparsity/linear_layer_sparsity": 0.8543069187263996, + "compression/movement_sparsity/model_sparsity": 0.8249588479126605, + "compression_loss": 94.23454284667969, + "distillation_loss": 4.144308090209961, + "epoch": 3.54, + "learning_rate": 3.589273973889359e-05, + "loss": 98.4715, + "step": 4187, + "task_loss": 2.6289780139923096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8844634385615413, + "compression/movement_sparsity/importance_threshold": -0.0008091874382823534, + "compression/movement_sparsity/linear_layer_sparsity": 0.8544358905235492, + "compression/movement_sparsity/model_sparsity": 0.8250833891318142, + "compression_loss": 94.25535583496094, + "distillation_loss": 4.427104949951172, + "epoch": 3.54, + "learning_rate": 3.5888043580351275e-05, + "loss": 98.0066, + "step": 4188, + "task_loss": 3.208298444747925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8846636751918884, + "compression/movement_sparsity/importance_threshold": -0.0008077850340222339, + "compression/movement_sparsity/linear_layer_sparsity": 0.8546387086908676, + "compression/movement_sparsity/model_sparsity": 0.8252792398711436, + "compression_loss": 94.27613067626953, + "distillation_loss": 4.748185157775879, + "epoch": 3.54, + "learning_rate": 3.588334742180896e-05, + "loss": 98.2163, + "step": 4189, + "task_loss": 2.598127841949463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8848636803348271, + "compression/movement_sparsity/importance_threshold": -0.0008063842510385366, + "compression/movement_sparsity/linear_layer_sparsity": 0.8548470000511308, + "compression/movement_sparsity/model_sparsity": 0.8254803757824026, + "compression_loss": 94.29689025878906, + "distillation_loss": 5.485854148864746, + "epoch": 3.54, + "learning_rate": 3.587865126326665e-05, + "loss": 98.3996, + "step": 4190, + "task_loss": 1.9777425527572632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.885063454124243, + "compression/movement_sparsity/importance_threshold": -0.0008049850883935627, + "compression/movement_sparsity/linear_layer_sparsity": 0.854957250905092, + "compression/movement_sparsity/model_sparsity": 0.8255868391803589, + "compression_loss": 94.317626953125, + "distillation_loss": 3.7345404624938965, + "epoch": 3.54, + "learning_rate": 3.587395510472434e-05, + "loss": 98.1652, + "step": 4191, + "task_loss": 2.169113874435425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.885262996694021, + "compression/movement_sparsity/importance_threshold": -0.0008035875451496179, + "compression/movement_sparsity/linear_layer_sparsity": 0.8551452711803743, + "compression/movement_sparsity/model_sparsity": 0.8257684003807673, + "compression_loss": 94.33834838867188, + "distillation_loss": 3.2240233421325684, + "epoch": 3.54, + "learning_rate": 3.586925894618203e-05, + "loss": 98.0911, + "step": 4192, + "task_loss": 2.38122296333313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8854623081780463, + "compression/movement_sparsity/importance_threshold": -0.000802191620369006, + "compression/movement_sparsity/linear_layer_sparsity": 0.8553374291418262, + "compression/movement_sparsity/model_sparsity": 0.8259539571250963, + "compression_loss": 94.35904693603516, + "distillation_loss": 4.507116794586182, + "epoch": 3.54, + "learning_rate": 3.5864562787639713e-05, + "loss": 98.7274, + "step": 4193, + "task_loss": 3.1533265113830566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8856613887102041, + "compression/movement_sparsity/importance_threshold": -0.0008007973131140293, + "compression/movement_sparsity/linear_layer_sparsity": 0.8555357638221136, + "compression/movement_sparsity/model_sparsity": 0.8261454783989669, + "compression_loss": 94.3796615600586, + "distillation_loss": 5.630313396453857, + "epoch": 3.54, + "learning_rate": 3.58598666290974e-05, + "loss": 98.6595, + "step": 4194, + "task_loss": 3.042914867401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8858602384243793, + "compression/movement_sparsity/importance_threshold": -0.0007994046224469942, + "compression/movement_sparsity/linear_layer_sparsity": 0.8557258946750674, + "compression/movement_sparsity/model_sparsity": 0.8263290776722109, + "compression_loss": 94.4003677368164, + "distillation_loss": 5.783313751220703, + "epoch": 3.55, + "learning_rate": 3.5855170470555086e-05, + "loss": 99.3368, + "step": 4195, + "task_loss": 3.7847111225128174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8860588574544573, + "compression/movement_sparsity/importance_threshold": -0.0007980135474302038, + "compression/movement_sparsity/linear_layer_sparsity": 0.8559378348306272, + "compression/movement_sparsity/model_sparsity": 0.826533737031423, + "compression_loss": 94.4209976196289, + "distillation_loss": 4.880232334136963, + "epoch": 3.55, + "learning_rate": 3.585047431201278e-05, + "loss": 98.4293, + "step": 4196, + "task_loss": 1.8543449640274048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8862572459343231, + "compression/movement_sparsity/importance_threshold": -0.0007966240871259619, + "compression/movement_sparsity/linear_layer_sparsity": 0.8561505142845804, + "compression/movement_sparsity/model_sparsity": 0.8267391102918544, + "compression_loss": 94.44161224365234, + "distillation_loss": 3.903545618057251, + "epoch": 3.55, + "learning_rate": 3.584577815347046e-05, + "loss": 98.8785, + "step": 4197, + "task_loss": 2.3983712196350098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.886455403997862, + "compression/movement_sparsity/importance_threshold": -0.0007952362405965717, + "compression/movement_sparsity/linear_layer_sparsity": 0.8561998684144253, + "compression/movement_sparsity/model_sparsity": 0.8267867689555081, + "compression_loss": 94.46214294433594, + "distillation_loss": 4.0785112380981445, + "epoch": 3.55, + "learning_rate": 3.584108199492815e-05, + "loss": 98.8763, + "step": 4198, + "task_loss": 2.148146867752075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.886653331778959, + "compression/movement_sparsity/importance_threshold": -0.0007938500069043387, + "compression/movement_sparsity/linear_layer_sparsity": 0.8562818709152571, + "compression/movement_sparsity/model_sparsity": 0.8268659544181671, + "compression_loss": 94.48271942138672, + "distillation_loss": 4.398580074310303, + "epoch": 3.55, + "learning_rate": 3.583638583638584e-05, + "loss": 98.0878, + "step": 4199, + "task_loss": 2.474411964416504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8868510294114992, + "compression/movement_sparsity/importance_threshold": -0.0007924653851115659, + "compression/movement_sparsity/linear_layer_sparsity": 0.8564375647720786, + "compression/movement_sparsity/model_sparsity": 0.8270162997120364, + "compression_loss": 94.50326538085938, + "distillation_loss": 3.9863524436950684, + "epoch": 3.55, + "learning_rate": 3.5831689677843525e-05, + "loss": 98.4826, + "step": 4200, + "task_loss": 2.3416385650634766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8870484970293679, + "compression/movement_sparsity/importance_threshold": -0.0007910823742805573, + "compression/movement_sparsity/linear_layer_sparsity": 0.856601808257095, + "compression/movement_sparsity/model_sparsity": 0.8271749009280704, + "compression_loss": 94.52374267578125, + "distillation_loss": 3.6895673274993896, + "epoch": 3.55, + "learning_rate": 3.582699351930121e-05, + "loss": 98.1414, + "step": 4201, + "task_loss": 1.4110249280929565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8872457347664502, + "compression/movement_sparsity/importance_threshold": -0.0007897009734736168, + "compression/movement_sparsity/linear_layer_sparsity": 0.8567363009438599, + "compression/movement_sparsity/model_sparsity": 0.827304773377297, + "compression_loss": 94.5442123413086, + "distillation_loss": 4.26611852645874, + "epoch": 3.55, + "learning_rate": 3.58222973607589e-05, + "loss": 99.3701, + "step": 4202, + "task_loss": 2.0326406955718994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8874427427566312, + "compression/movement_sparsity/importance_threshold": -0.0007883211817530491, + "compression/movement_sparsity/linear_layer_sparsity": 0.8569686075777417, + "compression/movement_sparsity/model_sparsity": 0.8275290995636463, + "compression_loss": 94.56466674804688, + "distillation_loss": 3.3899190425872803, + "epoch": 3.55, + "learning_rate": 3.581760120221659e-05, + "loss": 98.6473, + "step": 4203, + "task_loss": 1.7740589380264282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8876395211337959, + "compression/movement_sparsity/importance_threshold": -0.0007869429981811581, + "compression/movement_sparsity/linear_layer_sparsity": 0.8569487299902928, + "compression/movement_sparsity/model_sparsity": 0.8275099048324768, + "compression_loss": 94.58500671386719, + "distillation_loss": 3.7976932525634766, + "epoch": 3.55, + "learning_rate": 3.581290504367428e-05, + "loss": 99.2411, + "step": 4204, + "task_loss": 1.7195155620574951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8878360700318296, + "compression/movement_sparsity/importance_threshold": -0.000785566421820246, + "compression/movement_sparsity/linear_layer_sparsity": 0.8570022456546426, + "compression/movement_sparsity/model_sparsity": 0.8275615820691227, + "compression_loss": 94.60545349121094, + "distillation_loss": 3.3278534412384033, + "epoch": 3.55, + "learning_rate": 3.580820888513196e-05, + "loss": 98.5774, + "step": 4205, + "task_loss": 1.694261074066162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8880323895846174, + "compression/movement_sparsity/importance_threshold": -0.0007841914517326201, + "compression/movement_sparsity/linear_layer_sparsity": 0.857163996988623, + "compression/movement_sparsity/model_sparsity": 0.8277177767471756, + "compression_loss": 94.62581634521484, + "distillation_loss": 6.120212554931641, + "epoch": 3.56, + "learning_rate": 3.580351272658965e-05, + "loss": 99.4717, + "step": 4206, + "task_loss": 3.3743984699249268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8882284799260447, + "compression/movement_sparsity/importance_threshold": -0.0007828180869805801, + "compression/movement_sparsity/linear_layer_sparsity": 0.8572585437138077, + "compression/movement_sparsity/model_sparsity": 0.8278090755014905, + "compression_loss": 94.6461410522461, + "distillation_loss": 3.7408790588378906, + "epoch": 3.56, + "learning_rate": 3.5798816568047336e-05, + "loss": 98.6247, + "step": 4207, + "task_loss": 2.323953866958618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8884243411899961, + "compression/movement_sparsity/importance_threshold": -0.0007814463266264341, + "compression/movement_sparsity/linear_layer_sparsity": 0.8574244327339579, + "compression/movement_sparsity/model_sparsity": 0.8279692657234642, + "compression_loss": 94.66641235351562, + "distillation_loss": 4.504405975341797, + "epoch": 3.56, + "learning_rate": 3.579412040950503e-05, + "loss": 98.7681, + "step": 4208, + "task_loss": 2.0915000438690186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8886199735103572, + "compression/movement_sparsity/importance_threshold": -0.0007800761697324844, + "compression/movement_sparsity/linear_layer_sparsity": 0.8576295999623006, + "compression/movement_sparsity/model_sparsity": 0.8281673848263451, + "compression_loss": 94.68672180175781, + "distillation_loss": 4.816781997680664, + "epoch": 3.56, + "learning_rate": 3.5789424250962715e-05, + "loss": 99.2691, + "step": 4209, + "task_loss": 3.3535447120666504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.888815377021013, + "compression/movement_sparsity/importance_threshold": -0.0007787076153610338, + "compression/movement_sparsity/linear_layer_sparsity": 0.8577477803877396, + "compression/movement_sparsity/model_sparsity": 0.8282815053906047, + "compression_loss": 94.70698547363281, + "distillation_loss": 4.467840671539307, + "epoch": 3.56, + "learning_rate": 3.57847280924204e-05, + "loss": 99.0582, + "step": 4210, + "task_loss": 1.770585536956787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8890105518558485, + "compression/movement_sparsity/importance_threshold": -0.0007773406625743882, + "compression/movement_sparsity/linear_layer_sparsity": 0.8577616481947001, + "compression/movement_sparsity/model_sparsity": 0.8282948967957339, + "compression_loss": 94.7271957397461, + "distillation_loss": 4.68893575668335, + "epoch": 3.56, + "learning_rate": 3.578003193387809e-05, + "loss": 99.6146, + "step": 4211, + "task_loss": 3.1243414878845215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.889205498148749, + "compression/movement_sparsity/importance_threshold": -0.0007759753104348505, + "compression/movement_sparsity/linear_layer_sparsity": 0.8576759015052307, + "compression/movement_sparsity/model_sparsity": 0.8282120957688353, + "compression_loss": 94.74744415283203, + "distillation_loss": 4.043570041656494, + "epoch": 3.56, + "learning_rate": 3.577533577533578e-05, + "loss": 98.8245, + "step": 4212, + "task_loss": 1.9759941101074219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8894002160335998, + "compression/movement_sparsity/importance_threshold": -0.0007746115580047246, + "compression/movement_sparsity/linear_layer_sparsity": 0.8578332885938563, + "compression/movement_sparsity/model_sparsity": 0.8283640761267874, + "compression_loss": 94.76761627197266, + "distillation_loss": 3.9328396320343018, + "epoch": 3.56, + "learning_rate": 3.577063961679347e-05, + "loss": 98.1057, + "step": 4213, + "task_loss": 1.4132288694381714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8895947056442857, + "compression/movement_sparsity/importance_threshold": -0.0007732494043463153, + "compression/movement_sparsity/linear_layer_sparsity": 0.8578660204340168, + "compression/movement_sparsity/model_sparsity": 0.8283956835275434, + "compression_loss": 94.78777313232422, + "distillation_loss": 3.49436092376709, + "epoch": 3.56, + "learning_rate": 3.576594345825115e-05, + "loss": 98.2827, + "step": 4214, + "task_loss": 1.6415598392486572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8897889671146918, + "compression/movement_sparsity/importance_threshold": -0.0007718888485219264, + "compression/movement_sparsity/linear_layer_sparsity": 0.8580190552014555, + "compression/movement_sparsity/model_sparsity": 0.8285434610799306, + "compression_loss": 94.80792236328125, + "distillation_loss": 6.025250434875488, + "epoch": 3.56, + "learning_rate": 3.576124729970884e-05, + "loss": 100.0529, + "step": 4215, + "task_loss": 3.3867642879486084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8899830005787037, + "compression/movement_sparsity/importance_threshold": -0.0007705298895938602, + "compression/movement_sparsity/linear_layer_sparsity": 0.8581049330567689, + "compression/movement_sparsity/model_sparsity": 0.8286263887667229, + "compression_loss": 94.8280258178711, + "distillation_loss": 4.967032432556152, + "epoch": 3.56, + "learning_rate": 3.5756551141166526e-05, + "loss": 98.7806, + "step": 4216, + "task_loss": 2.105295419692993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8901768061702062, + "compression/movement_sparsity/importance_threshold": -0.0007691725266244231, + "compression/movement_sparsity/linear_layer_sparsity": 0.8582708697735896, + "compression/movement_sparsity/model_sparsity": 0.8287866250468396, + "compression_loss": 94.84805297851562, + "distillation_loss": 3.506471633911133, + "epoch": 3.56, + "learning_rate": 3.575185498262422e-05, + "loss": 98.3469, + "step": 4217, + "task_loss": 2.086794376373291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8903703840230844, + "compression/movement_sparsity/importance_threshold": -0.0007678167586759183, + "compression/movement_sparsity/linear_layer_sparsity": 0.8584046708586317, + "compression/movement_sparsity/model_sparsity": 0.8289158296529903, + "compression_loss": 94.86821746826172, + "distillation_loss": 5.5784807205200195, + "epoch": 3.57, + "learning_rate": 3.57471588240819e-05, + "loss": 99.3624, + "step": 4218, + "task_loss": 2.660674810409546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8905637342712236, + "compression/movement_sparsity/importance_threshold": -0.0007664625848106494, + "compression/movement_sparsity/linear_layer_sparsity": 0.8584049331903196, + "compression/movement_sparsity/model_sparsity": 0.8289160829727777, + "compression_loss": 94.88831329345703, + "distillation_loss": 3.4981958866119385, + "epoch": 3.57, + "learning_rate": 3.574246266553959e-05, + "loss": 98.4617, + "step": 4219, + "task_loss": 2.036100387573242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.890756857048509, + "compression/movement_sparsity/importance_threshold": -0.0007651100040909197, + "compression/movement_sparsity/linear_layer_sparsity": 0.8584708142165077, + "compression/movement_sparsity/model_sparsity": 0.8289797007830425, + "compression_loss": 94.90825653076172, + "distillation_loss": 4.332541465759277, + "epoch": 3.57, + "learning_rate": 3.573776650699728e-05, + "loss": 99.2355, + "step": 4220, + "task_loss": 3.7314934730529785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8909497524888255, + "compression/movement_sparsity/importance_threshold": -0.0007637590155790347, + "compression/movement_sparsity/linear_layer_sparsity": 0.8586050564957524, + "compression/movement_sparsity/model_sparsity": 0.8291093314270175, + "compression_loss": 94.92828369140625, + "distillation_loss": 3.470811367034912, + "epoch": 3.57, + "learning_rate": 3.5733070348454965e-05, + "loss": 98.4727, + "step": 4221, + "task_loss": 1.0225696563720703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8911424207260584, + "compression/movement_sparsity/importance_threshold": -0.0007624096183372975, + "compression/movement_sparsity/linear_layer_sparsity": 0.8586960498189816, + "compression/movement_sparsity/model_sparsity": 0.8291971988496656, + "compression_loss": 94.9482650756836, + "distillation_loss": 4.0635986328125, + "epoch": 3.57, + "learning_rate": 3.572837418991266e-05, + "loss": 98.9561, + "step": 4222, + "task_loss": 3.2986013889312744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8913348618940928, + "compression/movement_sparsity/importance_threshold": -0.000761061811428012, + "compression/movement_sparsity/linear_layer_sparsity": 0.8587614896509672, + "compression/movement_sparsity/model_sparsity": 0.8292603906221061, + "compression_loss": 94.9681396484375, + "distillation_loss": 3.7003707885742188, + "epoch": 3.57, + "learning_rate": 3.572367803137034e-05, + "loss": 98.576, + "step": 4223, + "task_loss": 1.8450802564620972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8915270761268139, + "compression/movement_sparsity/importance_threshold": -0.000759715593913482, + "compression/movement_sparsity/linear_layer_sparsity": 0.8589075845528418, + "compression/movement_sparsity/model_sparsity": 0.8294014667146608, + "compression_loss": 94.98812866210938, + "distillation_loss": 4.547817707061768, + "epoch": 3.57, + "learning_rate": 3.571898187282803e-05, + "loss": 99.5571, + "step": 4224, + "task_loss": 4.161828994750977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8917190635581068, + "compression/movement_sparsity/importance_threshold": -0.0007583709648560123, + "compression/movement_sparsity/linear_layer_sparsity": 0.8590175969234503, + "compression/movement_sparsity/model_sparsity": 0.8295076998219012, + "compression_loss": 95.00800323486328, + "distillation_loss": 5.438022136688232, + "epoch": 3.57, + "learning_rate": 3.571428571428572e-05, + "loss": 99.3561, + "step": 4225, + "task_loss": 2.476797103881836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8919108243218565, + "compression/movement_sparsity/importance_threshold": -0.0007570279233179069, + "compression/movement_sparsity/linear_layer_sparsity": 0.8591231258070277, + "compression/movement_sparsity/model_sparsity": 0.829609603463683, + "compression_loss": 95.02791595458984, + "distillation_loss": 2.6253061294555664, + "epoch": 3.57, + "learning_rate": 3.57095895557434e-05, + "loss": 99.2022, + "step": 4226, + "task_loss": 1.1413633823394775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8921023585519484, + "compression/movement_sparsity/importance_threshold": -0.0007556864683614687, + "compression/movement_sparsity/linear_layer_sparsity": 0.8592623881608469, + "compression/movement_sparsity/model_sparsity": 0.8297440817272274, + "compression_loss": 95.04777526855469, + "distillation_loss": 2.835491418838501, + "epoch": 3.57, + "learning_rate": 3.570489339720109e-05, + "loss": 98.7802, + "step": 4227, + "task_loss": 2.6953446865081787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8922936663822675, + "compression/movement_sparsity/importance_threshold": -0.0007543465990490026, + "compression/movement_sparsity/linear_layer_sparsity": 0.8593862206417453, + "compression/movement_sparsity/model_sparsity": 0.8298636601814536, + "compression_loss": 95.06761932373047, + "distillation_loss": 3.5182652473449707, + "epoch": 3.57, + "learning_rate": 3.5700197238658776e-05, + "loss": 98.9473, + "step": 4228, + "task_loss": 2.361135959625244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.892484747946699, + "compression/movement_sparsity/importance_threshold": -0.0007530083144428124, + "compression/movement_sparsity/linear_layer_sparsity": 0.8595354754480434, + "compression/movement_sparsity/model_sparsity": 0.8300077876259939, + "compression_loss": 95.0874252319336, + "distillation_loss": 4.198307037353516, + "epoch": 3.57, + "learning_rate": 3.569550108011647e-05, + "loss": 99.7338, + "step": 4229, + "task_loss": 2.732495069503784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8926756033791279, + "compression/movement_sparsity/importance_threshold": -0.000751671613605202, + "compression/movement_sparsity/linear_layer_sparsity": 0.8595954420870842, + "compression/movement_sparsity/model_sparsity": 0.8300656942265047, + "compression_loss": 95.10716247558594, + "distillation_loss": 6.82596492767334, + "epoch": 3.58, + "learning_rate": 3.5690804921574155e-05, + "loss": 101.299, + "step": 4230, + "task_loss": 3.593867540359497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8928662328134394, + "compression/movement_sparsity/importance_threshold": -0.0007503364955984753, + "compression/movement_sparsity/linear_layer_sparsity": 0.8597991426428077, + "compression/movement_sparsity/model_sparsity": 0.8302623970414829, + "compression_loss": 95.126953125, + "distillation_loss": 3.7819485664367676, + "epoch": 3.58, + "learning_rate": 3.568610876303184e-05, + "loss": 99.0461, + "step": 4231, + "task_loss": 1.9397151470184326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8930566363835187, + "compression/movement_sparsity/importance_threshold": -0.0007490029594849363, + "compression/movement_sparsity/linear_layer_sparsity": 0.8599317036144155, + "compression/movement_sparsity/model_sparsity": 0.8303904041359107, + "compression_loss": 95.14668273925781, + "distillation_loss": 4.32335090637207, + "epoch": 3.58, + "learning_rate": 3.568141260448953e-05, + "loss": 99.2191, + "step": 4232, + "task_loss": 2.7963733673095703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.893246814223251, + "compression/movement_sparsity/importance_threshold": -0.0007476710043268879, + "compression/movement_sparsity/linear_layer_sparsity": 0.8601192707713277, + "compression/movement_sparsity/model_sparsity": 0.8305715277839588, + "compression_loss": 95.16647338867188, + "distillation_loss": 3.908141613006592, + "epoch": 3.58, + "learning_rate": 3.5676716445947214e-05, + "loss": 98.8688, + "step": 4233, + "task_loss": 1.5860579013824463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8934367664665213, + "compression/movement_sparsity/importance_threshold": -0.0007463406291866375, + "compression/movement_sparsity/linear_layer_sparsity": 0.8603196921809513, + "compression/movement_sparsity/model_sparsity": 0.8307650641015935, + "compression_loss": 95.18608093261719, + "distillation_loss": 3.044175148010254, + "epoch": 3.58, + "learning_rate": 3.567202028740491e-05, + "loss": 98.6616, + "step": 4234, + "task_loss": 2.2728664875030518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8936264932472149, + "compression/movement_sparsity/importance_threshold": -0.0007450118331264848, + "compression/movement_sparsity/linear_layer_sparsity": 0.8605508063980696, + "compression/movement_sparsity/model_sparsity": 0.8309882388343632, + "compression_loss": 95.20569610595703, + "distillation_loss": 3.559412956237793, + "epoch": 3.58, + "learning_rate": 3.5667324128862594e-05, + "loss": 98.8096, + "step": 4235, + "task_loss": 1.5963290929794312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8938159946992167, + "compression/movement_sparsity/importance_threshold": -0.000743684615208736, + "compression/movement_sparsity/linear_layer_sparsity": 0.8607068460557524, + "compression/movement_sparsity/model_sparsity": 0.8311389180497707, + "compression_loss": 95.22530364990234, + "distillation_loss": 4.1290998458862305, + "epoch": 3.58, + "learning_rate": 3.566262797032028e-05, + "loss": 99.0606, + "step": 4236, + "task_loss": 2.407137632369995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8940052709564119, + "compression/movement_sparsity/importance_threshold": -0.0007423589744956962, + "compression/movement_sparsity/linear_layer_sparsity": 0.8607349632430378, + "compression/movement_sparsity/model_sparsity": 0.8311660693251741, + "compression_loss": 95.24494934082031, + "distillation_loss": 4.583278179168701, + "epoch": 3.58, + "learning_rate": 3.5657931811777967e-05, + "loss": 99.8632, + "step": 4237, + "task_loss": 2.260117769241333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.894194322152686, + "compression/movement_sparsity/importance_threshold": -0.0007410349100496656, + "compression/movement_sparsity/linear_layer_sparsity": 0.8609341445392272, + "compression/movement_sparsity/model_sparsity": 0.8313584081310862, + "compression_loss": 95.26454162597656, + "distillation_loss": 3.7170071601867676, + "epoch": 3.58, + "learning_rate": 3.565323565323565e-05, + "loss": 99.9107, + "step": 4238, + "task_loss": 1.7686738967895508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8943831484219236, + "compression/movement_sparsity/importance_threshold": -0.0007397124209329534, + "compression/movement_sparsity/linear_layer_sparsity": 0.8610274153784749, + "compression/movement_sparsity/model_sparsity": 0.8314484748300711, + "compression_loss": 95.28411865234375, + "distillation_loss": 4.712100982666016, + "epoch": 3.58, + "learning_rate": 3.5648539494693346e-05, + "loss": 99.8103, + "step": 4239, + "task_loss": 2.466038942337036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8945717498980102, + "compression/movement_sparsity/importance_threshold": -0.0007383915062078592, + "compression/movement_sparsity/linear_layer_sparsity": 0.861216950023047, + "compression/movement_sparsity/model_sparsity": 0.8316314983765253, + "compression_loss": 95.3037109375, + "distillation_loss": 5.715935707092285, + "epoch": 3.58, + "learning_rate": 3.5643843336151026e-05, + "loss": 100.0788, + "step": 4240, + "task_loss": 3.6584253311157227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8947601267148309, + "compression/movement_sparsity/importance_threshold": -0.0007370721649366885, + "compression/movement_sparsity/linear_layer_sparsity": 0.861314370472632, + "compression/movement_sparsity/model_sparsity": 0.8317255721339667, + "compression_loss": 95.3232192993164, + "distillation_loss": 5.758817672729492, + "epoch": 3.58, + "learning_rate": 3.563914717760872e-05, + "loss": 100.1407, + "step": 4241, + "task_loss": 2.5723631381988525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8949482790062707, + "compression/movement_sparsity/importance_threshold": -0.0007357543961817453, + "compression/movement_sparsity/linear_layer_sparsity": 0.861451677262959, + "compression/movement_sparsity/model_sparsity": 0.8318581620136408, + "compression_loss": 95.34275817871094, + "distillation_loss": 4.0753703117370605, + "epoch": 3.59, + "learning_rate": 3.5634451019066405e-05, + "loss": 99.5099, + "step": 4242, + "task_loss": 2.9933478832244873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8951362069062149, + "compression/movement_sparsity/importance_threshold": -0.0007344381990053335, + "compression/movement_sparsity/linear_layer_sparsity": 0.8616367880413381, + "compression/movement_sparsity/model_sparsity": 0.8320369136673152, + "compression_loss": 95.36219787597656, + "distillation_loss": 4.497125625610352, + "epoch": 3.59, + "learning_rate": 3.56297548605241e-05, + "loss": 98.8599, + "step": 4243, + "task_loss": 2.4475936889648438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8953239105485484, + "compression/movement_sparsity/importance_threshold": -0.0007331235724697587, + "compression/movement_sparsity/linear_layer_sparsity": 0.8617707799130623, + "compression/movement_sparsity/model_sparsity": 0.8321663025060385, + "compression_loss": 95.38167572021484, + "distillation_loss": 4.050002574920654, + "epoch": 3.59, + "learning_rate": 3.562505870198178e-05, + "loss": 99.796, + "step": 4244, + "task_loss": 3.2542877197265625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8955113900671567, + "compression/movement_sparsity/importance_threshold": -0.0007318105156373213, + "compression/movement_sparsity/linear_layer_sparsity": 0.8618737331764303, + "compression/movement_sparsity/model_sparsity": 0.8322657190080887, + "compression_loss": 95.4010238647461, + "distillation_loss": 4.273817539215088, + "epoch": 3.59, + "learning_rate": 3.5620362543439464e-05, + "loss": 99.6139, + "step": 4245, + "task_loss": 1.3939132690429688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8956986455959246, + "compression/movement_sparsity/importance_threshold": -0.0007304990275703287, + "compression/movement_sparsity/linear_layer_sparsity": 0.8620145695203776, + "compression/movement_sparsity/model_sparsity": 0.8324017171903579, + "compression_loss": 95.42040252685547, + "distillation_loss": 3.3786368370056152, + "epoch": 3.59, + "learning_rate": 3.561566638489716e-05, + "loss": 99.016, + "step": 4246, + "task_loss": 1.950330138206482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8958856772687375, + "compression/movement_sparsity/importance_threshold": -0.0007291891073310831, + "compression/movement_sparsity/linear_layer_sparsity": 0.8620702673074047, + "compression/movement_sparsity/model_sparsity": 0.8324555015870543, + "compression_loss": 95.43971252441406, + "distillation_loss": 5.5003581047058105, + "epoch": 3.59, + "learning_rate": 3.5610970226354844e-05, + "loss": 99.241, + "step": 4247, + "task_loss": 2.521172046661377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8960724852194804, + "compression/movement_sparsity/importance_threshold": -0.0007278807539818884, + "compression/movement_sparsity/linear_layer_sparsity": 0.8622166364651349, + "compression/movement_sparsity/model_sparsity": 0.8325968425139322, + "compression_loss": 95.45901489257812, + "distillation_loss": 4.9145660400390625, + "epoch": 3.59, + "learning_rate": 3.560627406781253e-05, + "loss": 99.5785, + "step": 4248, + "task_loss": 2.9931321144104004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8962590695820384, + "compression/movement_sparsity/importance_threshold": -0.0007265739665850493, + "compression/movement_sparsity/linear_layer_sparsity": 0.8623422933436816, + "compression/movement_sparsity/model_sparsity": 0.832718182692135, + "compression_loss": 95.47824096679688, + "distillation_loss": 4.132366180419922, + "epoch": 3.59, + "learning_rate": 3.5601577909270216e-05, + "loss": 99.7276, + "step": 4249, + "task_loss": 3.036210775375366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8964454304902967, + "compression/movement_sparsity/importance_threshold": -0.0007252687442028706, + "compression/movement_sparsity/linear_layer_sparsity": 0.8623711856018633, + "compression/movement_sparsity/model_sparsity": 0.8327460824123653, + "compression_loss": 95.4974594116211, + "distillation_loss": 3.266859531402588, + "epoch": 3.59, + "learning_rate": 3.559688175072791e-05, + "loss": 100.1496, + "step": 4250, + "task_loss": 1.7120857238769531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8966315680781406, + "compression/movement_sparsity/importance_threshold": -0.0007239650858976536, + "compression/movement_sparsity/linear_layer_sparsity": 0.862508110818826, + "compression/movement_sparsity/model_sparsity": 0.832878303826894, + "compression_loss": 95.51670837402344, + "distillation_loss": 4.251181125640869, + "epoch": 3.59, + "learning_rate": 3.5592185592185596e-05, + "loss": 99.7999, + "step": 4251, + "task_loss": 2.348019599914551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8968174824794549, + "compression/movement_sparsity/importance_threshold": -0.0007226629907317048, + "compression/movement_sparsity/linear_layer_sparsity": 0.8627675449340794, + "compression/movement_sparsity/model_sparsity": 0.8331288255821758, + "compression_loss": 95.53594970703125, + "distillation_loss": 4.091492652893066, + "epoch": 3.59, + "learning_rate": 3.558748943364328e-05, + "loss": 100.3227, + "step": 4252, + "task_loss": 2.108844757080078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.897003173828125, + "compression/movement_sparsity/importance_threshold": -0.0007213624577673272, + "compression/movement_sparsity/linear_layer_sparsity": 0.8629095856189578, + "compression/movement_sparsity/model_sparsity": 0.8332659867325604, + "compression_loss": 95.55511474609375, + "distillation_loss": 4.5245771408081055, + "epoch": 3.59, + "learning_rate": 3.558279327510097e-05, + "loss": 100.1262, + "step": 4253, + "task_loss": 4.600433349609375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.897188642258036, + "compression/movement_sparsity/importance_threshold": -0.0007200634860668248, + "compression/movement_sparsity/linear_layer_sparsity": 0.8630874226550791, + "compression/movement_sparsity/model_sparsity": 0.8334377145194002, + "compression_loss": 95.57428741455078, + "distillation_loss": 3.55635404586792, + "epoch": 3.6, + "learning_rate": 3.5578097116558655e-05, + "loss": 99.884, + "step": 4254, + "task_loss": 1.478051781654358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8973738879030729, + "compression/movement_sparsity/importance_threshold": -0.0007187660746925022, + "compression/movement_sparsity/linear_layer_sparsity": 0.8632548975895248, + "compression/movement_sparsity/model_sparsity": 0.8335994361746345, + "compression_loss": 95.59339904785156, + "distillation_loss": 4.404010772705078, + "epoch": 3.6, + "learning_rate": 3.557340095801635e-05, + "loss": 100.5491, + "step": 4255, + "task_loss": 1.9814642667770386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8975589108971211, + "compression/movement_sparsity/importance_threshold": -0.0007174702227066617, + "compression/movement_sparsity/linear_layer_sparsity": 0.8633225195441877, + "compression/movement_sparsity/model_sparsity": 0.8336647351071254, + "compression_loss": 95.61253356933594, + "distillation_loss": 4.804727077484131, + "epoch": 3.6, + "learning_rate": 3.5568704799474034e-05, + "loss": 99.8506, + "step": 4256, + "task_loss": 3.141603708267212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8977437113740656, + "compression/movement_sparsity/importance_threshold": -0.000716175929171608, + "compression/movement_sparsity/linear_layer_sparsity": 0.8633619408423919, + "compression/movement_sparsity/model_sparsity": 0.8337028021624621, + "compression_loss": 95.63155364990234, + "distillation_loss": 4.1111602783203125, + "epoch": 3.6, + "learning_rate": 3.556400864093172e-05, + "loss": 99.8709, + "step": 4257, + "task_loss": 2.270627737045288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8979282894677915, + "compression/movement_sparsity/importance_threshold": -0.0007148831931496468, + "compression/movement_sparsity/linear_layer_sparsity": 0.8634873950100888, + "compression/movement_sparsity/model_sparsity": 0.8338239465935564, + "compression_loss": 95.65061950683594, + "distillation_loss": 4.5870585441589355, + "epoch": 3.6, + "learning_rate": 3.555931248238941e-05, + "loss": 99.6647, + "step": 4258, + "task_loss": 2.215709924697876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8981126453121838, + "compression/movement_sparsity/importance_threshold": -0.0007135920137030811, + "compression/movement_sparsity/linear_layer_sparsity": 0.8636375441289595, + "compression/movement_sparsity/model_sparsity": 0.8339689376282813, + "compression_loss": 95.66960906982422, + "distillation_loss": 3.4400198459625244, + "epoch": 3.6, + "learning_rate": 3.555461632384709e-05, + "loss": 99.6717, + "step": 4259, + "task_loss": 1.9292771816253662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.898296779041128, + "compression/movement_sparsity/importance_threshold": -0.000712302389894213, + "compression/movement_sparsity/linear_layer_sparsity": 0.8637799902355376, + "compression/movement_sparsity/model_sparsity": 0.8341064902728829, + "compression_loss": 95.68865203857422, + "distillation_loss": 3.7589597702026367, + "epoch": 3.6, + "learning_rate": 3.5549920165304786e-05, + "loss": 100.173, + "step": 4260, + "task_loss": 2.6841092109680176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.898480690788509, + "compression/movement_sparsity/importance_threshold": -0.0007110143207853491, + "compression/movement_sparsity/linear_layer_sparsity": 0.8638688968294306, + "compression/movement_sparsity/model_sparsity": 0.834192342651767, + "compression_loss": 95.70760345458984, + "distillation_loss": 3.5339272022247314, + "epoch": 3.6, + "learning_rate": 3.5545224006762466e-05, + "loss": 99.4745, + "step": 4261, + "task_loss": 2.308706283569336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.898664380688212, + "compression/movement_sparsity/importance_threshold": -0.0007097278054387923, + "compression/movement_sparsity/linear_layer_sparsity": 0.8639742110779906, + "compression/movement_sparsity/model_sparsity": 0.8342940390319044, + "compression_loss": 95.72660827636719, + "distillation_loss": 4.77067232131958, + "epoch": 3.6, + "learning_rate": 3.554052784822016e-05, + "loss": 100.0226, + "step": 4262, + "task_loss": 2.272444725036621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8988478488741221, + "compression/movement_sparsity/importance_threshold": -0.0007084428429168467, + "compression/movement_sparsity/linear_layer_sparsity": 0.864079739961568, + "compression/movement_sparsity/model_sparsity": 0.834395942673686, + "compression_loss": 95.74553680419922, + "distillation_loss": 4.050790309906006, + "epoch": 3.6, + "learning_rate": 3.5535831689677845e-05, + "loss": 99.8489, + "step": 4263, + "task_loss": 2.708509683609009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8990310954801245, + "compression/movement_sparsity/importance_threshold": -0.000707159432281816, + "compression/movement_sparsity/linear_layer_sparsity": 0.864218286865329, + "compression/movement_sparsity/model_sparsity": 0.8345297300650828, + "compression_loss": 95.76445770263672, + "distillation_loss": 3.9858670234680176, + "epoch": 3.6, + "learning_rate": 3.553113553113553e-05, + "loss": 100.1182, + "step": 4264, + "task_loss": 1.1736524105072021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8992141206401043, + "compression/movement_sparsity/importance_threshold": -0.0007058775725960042, + "compression/movement_sparsity/linear_layer_sparsity": 0.8643814452510905, + "compression/movement_sparsity/model_sparsity": 0.8346872834583595, + "compression_loss": 95.78329467773438, + "distillation_loss": 4.853404521942139, + "epoch": 3.6, + "learning_rate": 3.5526439372593225e-05, + "loss": 100.4709, + "step": 4265, + "task_loss": 2.4195873737335205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8993969244879467, + "compression/movement_sparsity/importance_threshold": -0.0007045972629217152, + "compression/movement_sparsity/linear_layer_sparsity": 0.8645093796456557, + "compression/movement_sparsity/model_sparsity": 0.8348108229128991, + "compression_loss": 95.80215454101562, + "distillation_loss": 4.108983039855957, + "epoch": 3.61, + "learning_rate": 3.5521743214050904e-05, + "loss": 100.4064, + "step": 4266, + "task_loss": 1.4528888463974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8995795071575368, + "compression/movement_sparsity/importance_threshold": -0.0007033185023212528, + "compression/movement_sparsity/linear_layer_sparsity": 0.864595329045975, + "compression/movement_sparsity/model_sparsity": 0.8348938196869062, + "compression_loss": 95.82109069824219, + "distillation_loss": 4.723228454589844, + "epoch": 3.61, + "learning_rate": 3.55170470555086e-05, + "loss": 100.7143, + "step": 4267, + "task_loss": 3.076694965362549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8997618687827597, + "compression/movement_sparsity/importance_threshold": -0.0007020412898569219, + "compression/movement_sparsity/linear_layer_sparsity": 0.8648336454603454, + "compression/movement_sparsity/model_sparsity": 0.8351239491992959, + "compression_loss": 95.83995819091797, + "distillation_loss": 3.680413246154785, + "epoch": 3.61, + "learning_rate": 3.5512350896966284e-05, + "loss": 99.8117, + "step": 4268, + "task_loss": 2.0536088943481445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.8999440094975006, + "compression/movement_sparsity/importance_threshold": -0.0007007656245910255, + "compression/movement_sparsity/linear_layer_sparsity": 0.8648962473404337, + "compression/movement_sparsity/model_sparsity": 0.8351844005122172, + "compression_loss": 95.85875701904297, + "distillation_loss": 5.204758644104004, + "epoch": 3.61, + "learning_rate": 3.550765473842397e-05, + "loss": 100.0336, + "step": 4269, + "task_loss": 2.403553009033203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9001259294356445, + "compression/movement_sparsity/importance_threshold": -0.00069949150558587, + "compression/movement_sparsity/linear_layer_sparsity": 0.8650124006573746, + "compression/movement_sparsity/model_sparsity": 0.8352965636053918, + "compression_loss": 95.87752532958984, + "distillation_loss": 4.010772705078125, + "epoch": 3.61, + "learning_rate": 3.5502958579881656e-05, + "loss": 100.0025, + "step": 4270, + "task_loss": 1.8188772201538086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9003076287310768, + "compression/movement_sparsity/importance_threshold": -0.0006982189319037551, + "compression/movement_sparsity/linear_layer_sparsity": 0.8651424933262819, + "compression/movement_sparsity/model_sparsity": 0.8354221871909103, + "compression_loss": 95.89631652832031, + "distillation_loss": 3.156705856323242, + "epoch": 3.61, + "learning_rate": 3.549826242133934e-05, + "loss": 100.1325, + "step": 4271, + "task_loss": 2.598294258117676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9004891075176825, + "compression/movement_sparsity/importance_threshold": -0.0006969479026069881, + "compression/movement_sparsity/linear_layer_sparsity": 0.8652795497090885, + "compression/movement_sparsity/model_sparsity": 0.8355545352653327, + "compression_loss": 95.91511535644531, + "distillation_loss": 4.084255218505859, + "epoch": 3.61, + "learning_rate": 3.5493566262797036e-05, + "loss": 99.616, + "step": 4272, + "task_loss": 3.53688383102417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9006703659293467, + "compression/movement_sparsity/importance_threshold": -0.0006956784167578721, + "compression/movement_sparsity/linear_layer_sparsity": 0.8653527164017021, + "compression/movement_sparsity/model_sparsity": 0.835625188456968, + "compression_loss": 95.933837890625, + "distillation_loss": 4.723835468292236, + "epoch": 3.61, + "learning_rate": 3.548887010425472e-05, + "loss": 100.3172, + "step": 4273, + "task_loss": 2.9156343936920166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9008514040999546, + "compression/movement_sparsity/importance_threshold": -0.00069441047341871, + "compression/movement_sparsity/linear_layer_sparsity": 0.8653602643998156, + "compression/movement_sparsity/model_sparsity": 0.8356324771581259, + "compression_loss": 95.95258331298828, + "distillation_loss": 4.618986129760742, + "epoch": 3.61, + "learning_rate": 3.548417394571241e-05, + "loss": 100.2825, + "step": 4274, + "task_loss": 2.5440514087677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9010322221633913, + "compression/movement_sparsity/importance_threshold": -0.0006931440716518075, + "compression/movement_sparsity/linear_layer_sparsity": 0.8655083864101883, + "compression/movement_sparsity/model_sparsity": 0.8357755107217658, + "compression_loss": 95.97126007080078, + "distillation_loss": 3.871387004852295, + "epoch": 3.61, + "learning_rate": 3.5479477787170095e-05, + "loss": 99.8351, + "step": 4275, + "task_loss": 2.873246908187866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.901212820253542, + "compression/movement_sparsity/importance_threshold": -0.0006918792105194676, + "compression/movement_sparsity/linear_layer_sparsity": 0.8656134860237308, + "compression/movement_sparsity/model_sparsity": 0.8358769998402589, + "compression_loss": 95.99002075195312, + "distillation_loss": 3.659684896469116, + "epoch": 3.61, + "learning_rate": 3.547478162862778e-05, + "loss": 99.8265, + "step": 4276, + "task_loss": 1.4133249521255493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9013931985042917, + "compression/movement_sparsity/importance_threshold": -0.0006906158890839951, + "compression/movement_sparsity/linear_layer_sparsity": 0.8657292100706369, + "compression/movement_sparsity/model_sparsity": 0.8359887484101448, + "compression_loss": 96.0086669921875, + "distillation_loss": 5.459488868713379, + "epoch": 3.61, + "learning_rate": 3.5470085470085474e-05, + "loss": 99.9944, + "step": 4277, + "task_loss": 2.44586181640625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9015733570495258, + "compression/movement_sparsity/importance_threshold": -0.0006893541064076913, + "compression/movement_sparsity/linear_layer_sparsity": 0.865974967165612, + "compression/movement_sparsity/model_sparsity": 0.8362260629928704, + "compression_loss": 96.02733612060547, + "distillation_loss": 3.229130744934082, + "epoch": 3.62, + "learning_rate": 3.5465389311543154e-05, + "loss": 100.0239, + "step": 4278, + "task_loss": 2.019221782684326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9017532960231291, + "compression/movement_sparsity/importance_threshold": -0.0006880938615528645, + "compression/movement_sparsity/linear_layer_sparsity": 0.8660275765932214, + "compression/movement_sparsity/model_sparsity": 0.8362768651247958, + "compression_loss": 96.04586791992188, + "distillation_loss": 3.4382500648498535, + "epoch": 3.62, + "learning_rate": 3.546069315300085e-05, + "loss": 99.7238, + "step": 4279, + "task_loss": 2.159698009490967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9019330155589871, + "compression/movement_sparsity/importance_threshold": -0.000686835153581815, + "compression/movement_sparsity/linear_layer_sparsity": 0.8661498350839919, + "compression/movement_sparsity/model_sparsity": 0.8363949236602973, + "compression_loss": 96.06439208984375, + "distillation_loss": 4.059858798980713, + "epoch": 3.62, + "learning_rate": 3.5455996994458533e-05, + "loss": 100.2131, + "step": 4280, + "task_loss": 1.8517813682556152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9021125157909847, + "compression/movement_sparsity/importance_threshold": -0.0006855779815568485, + "compression/movement_sparsity/linear_layer_sparsity": 0.8662729759631674, + "compression/movement_sparsity/model_sparsity": 0.8365138342714475, + "compression_loss": 96.08303833007812, + "distillation_loss": 5.325155258178711, + "epoch": 3.62, + "learning_rate": 3.5451300835916227e-05, + "loss": 100.6121, + "step": 4281, + "task_loss": 2.363600492477417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9022917968530071, + "compression/movement_sparsity/importance_threshold": -0.0006843223445402698, + "compression/movement_sparsity/linear_layer_sparsity": 0.8664202752059733, + "compression/movement_sparsity/model_sparsity": 0.8366560733321174, + "compression_loss": 96.1015625, + "distillation_loss": 5.921926975250244, + "epoch": 3.62, + "learning_rate": 3.544660467737391e-05, + "loss": 100.867, + "step": 4282, + "task_loss": 2.688180446624756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9024708588789395, + "compression/movement_sparsity/importance_threshold": -0.0006830682415943801, + "compression/movement_sparsity/linear_layer_sparsity": 0.8665083828806347, + "compression/movement_sparsity/model_sparsity": 0.8367411542371034, + "compression_loss": 96.1201400756836, + "distillation_loss": 4.978620529174805, + "epoch": 3.62, + "learning_rate": 3.544190851883159e-05, + "loss": 100.4941, + "step": 4283, + "task_loss": 2.4804446697235107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9026497020026669, + "compression/movement_sparsity/importance_threshold": -0.000681815671781486, + "compression/movement_sparsity/linear_layer_sparsity": 0.8667356813641095, + "compression/movement_sparsity/model_sparsity": 0.8369606443184189, + "compression_loss": 96.13858032226562, + "distillation_loss": 5.0138092041015625, + "epoch": 3.62, + "learning_rate": 3.5437212360289286e-05, + "loss": 100.8471, + "step": 4284, + "task_loss": 3.699549913406372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9028283263580746, + "compression/movement_sparsity/importance_threshold": -0.0006805646341638896, + "compression/movement_sparsity/linear_layer_sparsity": 0.8667421204146328, + "compression/movement_sparsity/model_sparsity": 0.836966862167748, + "compression_loss": 96.15709686279297, + "distillation_loss": 3.905506134033203, + "epoch": 3.62, + "learning_rate": 3.543251620174697e-05, + "loss": 99.9716, + "step": 4285, + "task_loss": 2.4717812538146973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9030067320790476, + "compression/movement_sparsity/importance_threshold": -0.0006793151278038975, + "compression/movement_sparsity/linear_layer_sparsity": 0.8668749317937611, + "compression/movement_sparsity/model_sparsity": 0.8370951110674275, + "compression_loss": 96.17554473876953, + "distillation_loss": 3.2638678550720215, + "epoch": 3.62, + "learning_rate": 3.5427820043204665e-05, + "loss": 100.0794, + "step": 4286, + "task_loss": 2.090071439743042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9031849192994712, + "compression/movement_sparsity/importance_threshold": -0.0006780671517638108, + "compression/movement_sparsity/linear_layer_sparsity": 0.8669994201038795, + "compression/movement_sparsity/model_sparsity": 0.8372153228211225, + "compression_loss": 96.19392395019531, + "distillation_loss": 2.7314634323120117, + "epoch": 3.62, + "learning_rate": 3.5423123884662345e-05, + "loss": 99.3524, + "step": 4287, + "task_loss": 1.4440038204193115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9033628881532304, + "compression/movement_sparsity/importance_threshold": -0.0006768207051059344, + "compression/movement_sparsity/linear_layer_sparsity": 0.8671196634103195, + "compression/movement_sparsity/model_sparsity": 0.8373314354000747, + "compression_loss": 96.21239471435547, + "distillation_loss": 4.933556079864502, + "epoch": 3.62, + "learning_rate": 3.541842772612004e-05, + "loss": 100.0904, + "step": 4288, + "task_loss": 1.985803484916687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9035406387742103, + "compression/movement_sparsity/importance_threshold": -0.000675575786892574, + "compression/movement_sparsity/linear_layer_sparsity": 0.8671821698970668, + "compression/movement_sparsity/model_sparsity": 0.8373917945967096, + "compression_loss": 96.230712890625, + "distillation_loss": 3.775048017501831, + "epoch": 3.63, + "learning_rate": 3.5413731567577724e-05, + "loss": 100.6447, + "step": 4289, + "task_loss": 2.8999719619750977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9037181712962963, + "compression/movement_sparsity/importance_threshold": -0.0006743323961860307, + "compression/movement_sparsity/linear_layer_sparsity": 0.8672553604380157, + "compression/movement_sparsity/model_sparsity": 0.8374624708174165, + "compression_loss": 96.24906921386719, + "distillation_loss": 3.9476237297058105, + "epoch": 3.63, + "learning_rate": 3.540903540903541e-05, + "loss": 99.6322, + "step": 4290, + "task_loss": 2.2831263542175293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9038954858533733, + "compression/movement_sparsity/importance_threshold": -0.0006730905320486095, + "compression/movement_sparsity/linear_layer_sparsity": 0.8672889865907488, + "compression/movement_sparsity/model_sparsity": 0.8374949418083572, + "compression_loss": 96.26739501953125, + "distillation_loss": 4.304094314575195, + "epoch": 3.63, + "learning_rate": 3.54043392504931e-05, + "loss": 100.6191, + "step": 4291, + "task_loss": 1.8310171365737915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9040725825793265, + "compression/movement_sparsity/importance_threshold": -0.000671850193542615, + "compression/movement_sparsity/linear_layer_sparsity": 0.8673930249533718, + "compression/movement_sparsity/model_sparsity": 0.8375954061331645, + "compression_loss": 96.2857437133789, + "distillation_loss": 3.959082841873169, + "epoch": 3.63, + "learning_rate": 3.539964309195078e-05, + "loss": 100.2784, + "step": 4292, + "task_loss": 1.9013115167617798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9042494616080411, + "compression/movement_sparsity/importance_threshold": -0.000670611379730352, + "compression/movement_sparsity/linear_layer_sparsity": 0.8675244531290542, + "compression/movement_sparsity/model_sparsity": 0.837722319346692, + "compression_loss": 96.30399322509766, + "distillation_loss": 4.970921039581299, + "epoch": 3.63, + "learning_rate": 3.5394946933408476e-05, + "loss": 100.9185, + "step": 4293, + "task_loss": 3.71980357170105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9044261230734021, + "compression/movement_sparsity/importance_threshold": -0.0006693740896741227, + "compression/movement_sparsity/linear_layer_sparsity": 0.8676575387640381, + "compression/movement_sparsity/model_sparsity": 0.8378508330806949, + "compression_loss": 96.32227325439453, + "distillation_loss": 5.669394016265869, + "epoch": 3.63, + "learning_rate": 3.539025077486616e-05, + "loss": 100.9656, + "step": 4294, + "task_loss": 2.603111743927002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9046025671092948, + "compression/movement_sparsity/importance_threshold": -0.0006681383224362326, + "compression/movement_sparsity/linear_layer_sparsity": 0.8677038641553034, + "compression/movement_sparsity/model_sparsity": 0.8378955670522567, + "compression_loss": 96.34054565429688, + "distillation_loss": 3.915592670440674, + "epoch": 3.63, + "learning_rate": 3.538555461632385e-05, + "loss": 100.3994, + "step": 4295, + "task_loss": 1.5478243827819824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9047787938496042, + "compression/movement_sparsity/importance_threshold": -0.0006669040770789841, + "compression/movement_sparsity/linear_layer_sparsity": 0.8677480431963943, + "compression/movement_sparsity/model_sparsity": 0.8379382284073754, + "compression_loss": 96.35874938964844, + "distillation_loss": 3.8440425395965576, + "epoch": 3.63, + "learning_rate": 3.5380858457781535e-05, + "loss": 100.9256, + "step": 4296, + "task_loss": 2.1421358585357666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9049548034282157, + "compression/movement_sparsity/importance_threshold": -0.0006656713526646818, + "compression/movement_sparsity/linear_layer_sparsity": 0.867913324083995, + "compression/movement_sparsity/model_sparsity": 0.8380978313880235, + "compression_loss": 96.3769760131836, + "distillation_loss": 6.172903537750244, + "epoch": 3.63, + "learning_rate": 3.537616229923922e-05, + "loss": 100.3617, + "step": 4297, + "task_loss": 2.9490866661071777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9051305959790141, + "compression/movement_sparsity/importance_threshold": -0.0006644401482556296, + "compression/movement_sparsity/linear_layer_sparsity": 0.8680980294406745, + "compression/movement_sparsity/model_sparsity": 0.838276191547481, + "compression_loss": 96.39523315429688, + "distillation_loss": 3.86124849319458, + "epoch": 3.63, + "learning_rate": 3.5371466140696915e-05, + "loss": 101.3411, + "step": 4298, + "task_loss": 2.0283093452453613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9053061716358847, + "compression/movement_sparsity/importance_threshold": -0.0006632104629141324, + "compression/movement_sparsity/linear_layer_sparsity": 0.8681946867435308, + "compression/movement_sparsity/model_sparsity": 0.8383695283746315, + "compression_loss": 96.41343688964844, + "distillation_loss": 6.689557075500488, + "epoch": 3.63, + "learning_rate": 3.53667699821546e-05, + "loss": 101.0556, + "step": 4299, + "task_loss": 2.874082088470459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9054815305327127, + "compression/movement_sparsity/importance_threshold": -0.0006619822957024931, + "compression/movement_sparsity/linear_layer_sparsity": 0.8682772496802416, + "compression/movement_sparsity/model_sparsity": 0.8384492550204729, + "compression_loss": 96.43156433105469, + "distillation_loss": 3.4758646488189697, + "epoch": 3.63, + "learning_rate": 3.536207382361229e-05, + "loss": 100.6492, + "step": 4300, + "task_loss": 1.9366663694381714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9056566728033831, + "compression/movement_sparsity/importance_threshold": -0.0006607556456830165, + "compression/movement_sparsity/linear_layer_sparsity": 0.8683327566805865, + "compression/movement_sparsity/model_sparsity": 0.8385028551845964, + "compression_loss": 96.44973754882812, + "distillation_loss": 6.7592010498046875, + "epoch": 3.64, + "learning_rate": 3.5357377665069974e-05, + "loss": 101.5295, + "step": 4301, + "task_loss": 3.4434211254119873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9058315985817812, + "compression/movement_sparsity/importance_threshold": -0.0006595305119180056, + "compression/movement_sparsity/linear_layer_sparsity": 0.8684185749150618, + "compression/movement_sparsity/model_sparsity": 0.8385857252987098, + "compression_loss": 96.46785736083984, + "distillation_loss": 6.425994873046875, + "epoch": 3.64, + "learning_rate": 3.535268150652766e-05, + "loss": 100.966, + "step": 4302, + "task_loss": 5.2162885665893555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.906006308001792, + "compression/movement_sparsity/importance_threshold": -0.0006583068934697652, + "compression/movement_sparsity/linear_layer_sparsity": 0.868503904258664, + "compression/movement_sparsity/model_sparsity": 0.8386681233168556, + "compression_loss": 96.48593139648438, + "distillation_loss": 2.6008460521698, + "epoch": 3.64, + "learning_rate": 3.534798534798535e-05, + "loss": 100.4335, + "step": 4303, + "task_loss": 1.2341915369033813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9061808011973007, + "compression/movement_sparsity/importance_threshold": -0.0006570847894005992, + "compression/movement_sparsity/linear_layer_sparsity": 0.8686686008620506, + "compression/movement_sparsity/model_sparsity": 0.8388271620852498, + "compression_loss": 96.50395202636719, + "distillation_loss": 4.709802627563477, + "epoch": 3.64, + "learning_rate": 3.534328918944303e-05, + "loss": 100.8767, + "step": 4304, + "task_loss": 2.802543878555298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9063550783021925, + "compression/movement_sparsity/importance_threshold": -0.0006558641987728115, + "compression/movement_sparsity/linear_layer_sparsity": 0.8687918132862319, + "compression/movement_sparsity/model_sparsity": 0.8389461417836147, + "compression_loss": 96.5219497680664, + "distillation_loss": 5.595571517944336, + "epoch": 3.64, + "learning_rate": 3.5338593030900726e-05, + "loss": 100.5052, + "step": 4305, + "task_loss": 3.3218724727630615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9065291394503525, + "compression/movement_sparsity/importance_threshold": -0.0006546451206487051, + "compression/movement_sparsity/linear_layer_sparsity": 0.8689184598706924, + "compression/movement_sparsity/model_sparsity": 0.8390684376682885, + "compression_loss": 96.5399398803711, + "distillation_loss": 2.7710704803466797, + "epoch": 3.64, + "learning_rate": 3.533389687235841e-05, + "loss": 100.2822, + "step": 4306, + "task_loss": 1.3011189699172974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9067029847756658, + "compression/movement_sparsity/importance_threshold": -0.0006534275540905857, + "compression/movement_sparsity/linear_layer_sparsity": 0.8689269379538815, + "compression/movement_sparsity/model_sparsity": 0.8390766245032385, + "compression_loss": 96.55785369873047, + "distillation_loss": 3.722095012664795, + "epoch": 3.64, + "learning_rate": 3.5329200713816105e-05, + "loss": 100.1439, + "step": 4307, + "task_loss": 2.2931594848632812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9068766144120175, + "compression/movement_sparsity/importance_threshold": -0.0006522114981607563, + "compression/movement_sparsity/linear_layer_sparsity": 0.8689741218852166, + "compression/movement_sparsity/model_sparsity": 0.8391221875213775, + "compression_loss": 96.5757064819336, + "distillation_loss": 4.981956481933594, + "epoch": 3.64, + "learning_rate": 3.5324504555273785e-05, + "loss": 100.9473, + "step": 4308, + "task_loss": 2.4355037212371826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9070500284932929, + "compression/movement_sparsity/importance_threshold": -0.00065099695192152, + "compression/movement_sparsity/linear_layer_sparsity": 0.869105752771749, + "compression/movement_sparsity/model_sparsity": 0.8392492964820134, + "compression_loss": 96.59356689453125, + "distillation_loss": 3.977102279663086, + "epoch": 3.64, + "learning_rate": 3.531980839673147e-05, + "loss": 100.9463, + "step": 4309, + "task_loss": 2.4195563793182373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9072232271533769, + "compression/movement_sparsity/importance_threshold": -0.0006497839144351831, + "compression/movement_sparsity/linear_layer_sparsity": 0.8692262703340445, + "compression/movement_sparsity/model_sparsity": 0.8393656738952888, + "compression_loss": 96.6114501953125, + "distillation_loss": 6.106236457824707, + "epoch": 3.64, + "learning_rate": 3.5315112238189164e-05, + "loss": 101.4508, + "step": 4310, + "task_loss": 3.640273094177246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9073962105261548, + "compression/movement_sparsity/importance_threshold": -0.0006485723847640479, + "compression/movement_sparsity/linear_layer_sparsity": 0.8693709820324734, + "compression/movement_sparsity/model_sparsity": 0.8395054143016913, + "compression_loss": 96.62923431396484, + "distillation_loss": 4.5087432861328125, + "epoch": 3.64, + "learning_rate": 3.531041607964685e-05, + "loss": 101.0761, + "step": 4311, + "task_loss": 2.4355404376983643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9075689787455119, + "compression/movement_sparsity/importance_threshold": -0.0006473623619704183, + "compression/movement_sparsity/linear_layer_sparsity": 0.8695083722919739, + "compression/movement_sparsity/model_sparsity": 0.8396380847831161, + "compression_loss": 96.64703369140625, + "distillation_loss": 3.368297576904297, + "epoch": 3.64, + "learning_rate": 3.5305719921104544e-05, + "loss": 100.7962, + "step": 4312, + "task_loss": 2.0810179710388184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.907741531945333, + "compression/movement_sparsity/importance_threshold": -0.0006461538451165991, + "compression/movement_sparsity/linear_layer_sparsity": 0.8696201375152248, + "compression/movement_sparsity/model_sparsity": 0.8397460105271183, + "compression_loss": 96.6648178100586, + "distillation_loss": 3.4189343452453613, + "epoch": 3.65, + "learning_rate": 3.530102376256222e-05, + "loss": 99.7394, + "step": 4313, + "task_loss": 1.8139153718948364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9079138702595035, + "compression/movement_sparsity/importance_threshold": -0.0006449468332648933, + "compression/movement_sparsity/linear_layer_sparsity": 0.8697905696432442, + "compression/movement_sparsity/model_sparsity": 0.8399105877872297, + "compression_loss": 96.68257141113281, + "distillation_loss": 4.248610496520996, + "epoch": 3.65, + "learning_rate": 3.5296327604019916e-05, + "loss": 101.1133, + "step": 4314, + "task_loss": 2.6366236209869385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9080859938219084, + "compression/movement_sparsity/importance_threshold": -0.0006437413254776073, + "compression/movement_sparsity/linear_layer_sparsity": 0.8698971478535735, + "compression/movement_sparsity/model_sparsity": 0.8400135047081613, + "compression_loss": 96.70030975341797, + "distillation_loss": 4.740636825561523, + "epoch": 3.65, + "learning_rate": 3.52916314454776e-05, + "loss": 101.2973, + "step": 4315, + "task_loss": 2.3883512020111084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9082579027664329, + "compression/movement_sparsity/importance_threshold": -0.0006425373208170408, + "compression/movement_sparsity/linear_layer_sparsity": 0.8700576590741199, + "compression/movement_sparsity/model_sparsity": 0.8401685018744915, + "compression_loss": 96.71800231933594, + "distillation_loss": 4.079943656921387, + "epoch": 3.65, + "learning_rate": 3.528693528693529e-05, + "loss": 101.5221, + "step": 4316, + "task_loss": 2.5724759101867676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.908429597226962, + "compression/movement_sparsity/importance_threshold": -0.0006413348183455028, + "compression/movement_sparsity/linear_layer_sparsity": 0.870175672561212, + "compression/movement_sparsity/model_sparsity": 0.8402824612352502, + "compression_loss": 96.73567962646484, + "distillation_loss": 5.691556930541992, + "epoch": 3.65, + "learning_rate": 3.5282239128392975e-05, + "loss": 101.5329, + "step": 4317, + "task_loss": 3.272876739501953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9086010773373813, + "compression/movement_sparsity/importance_threshold": -0.0006401338171252921, + "compression/movement_sparsity/linear_layer_sparsity": 0.870365994200848, + "compression/movement_sparsity/model_sparsity": 0.8404662447410668, + "compression_loss": 96.75335693359375, + "distillation_loss": 3.6307127475738525, + "epoch": 3.65, + "learning_rate": 3.527754296985066e-05, + "loss": 100.7863, + "step": 4318, + "task_loss": 2.4787747859954834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9087723432315753, + "compression/movement_sparsity/importance_threshold": -0.0006389343162187169, + "compression/movement_sparsity/linear_layer_sparsity": 0.8704239695038936, + "compression/movement_sparsity/model_sparsity": 0.8405222284140998, + "compression_loss": 96.77095794677734, + "distillation_loss": 4.005181312561035, + "epoch": 3.65, + "learning_rate": 3.5272846811308355e-05, + "loss": 100.9218, + "step": 4319, + "task_loss": 1.1956453323364258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9089433950434296, + "compression/movement_sparsity/importance_threshold": -0.0006377363146880784, + "compression/movement_sparsity/linear_layer_sparsity": 0.8705706844624852, + "compression/movement_sparsity/model_sparsity": 0.8406639032625159, + "compression_loss": 96.78865051269531, + "distillation_loss": 4.763053894042969, + "epoch": 3.65, + "learning_rate": 3.526815065276604e-05, + "loss": 100.9945, + "step": 4320, + "task_loss": 2.3702237606048584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9091142329068291, + "compression/movement_sparsity/importance_threshold": -0.0006365398115956832, + "compression/movement_sparsity/linear_layer_sparsity": 0.8706796236580064, + "compression/movement_sparsity/model_sparsity": 0.8407691000615348, + "compression_loss": 96.80622863769531, + "distillation_loss": 4.643828392028809, + "epoch": 3.65, + "learning_rate": 3.526345449422373e-05, + "loss": 101.0123, + "step": 4321, + "task_loss": 2.353712797164917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9092848569556591, + "compression/movement_sparsity/importance_threshold": -0.0006353448060038326, + "compression/movement_sparsity/linear_layer_sparsity": 0.8708399202435354, + "compression/movement_sparsity/model_sparsity": 0.8409238899662208, + "compression_loss": 96.82382202148438, + "distillation_loss": 4.092759132385254, + "epoch": 3.65, + "learning_rate": 3.5258758335681414e-05, + "loss": 102.0031, + "step": 4322, + "task_loss": 1.0409988164901733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9094552673238047, + "compression/movement_sparsity/importance_threshold": -0.0006341512969748322, + "compression/movement_sparsity/linear_layer_sparsity": 0.8708245857639557, + "compression/movement_sparsity/model_sparsity": 0.8409090822731891, + "compression_loss": 96.84142303466797, + "distillation_loss": 4.551883697509766, + "epoch": 3.65, + "learning_rate": 3.52540621771391e-05, + "loss": 100.6823, + "step": 4323, + "task_loss": 2.5566530227661133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9096254641451509, + "compression/movement_sparsity/importance_threshold": -0.0006329592835709859, + "compression/movement_sparsity/linear_layer_sparsity": 0.8710269627370716, + "compression/movement_sparsity/model_sparsity": 0.841104506974694, + "compression_loss": 96.85894012451172, + "distillation_loss": 4.072083473205566, + "epoch": 3.65, + "learning_rate": 3.524936601859679e-05, + "loss": 101.1207, + "step": 4324, + "task_loss": 2.2178080081939697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.909795447553583, + "compression/movement_sparsity/importance_threshold": -0.0006317687648545976, + "compression/movement_sparsity/linear_layer_sparsity": 0.871089624237998, + "compression/movement_sparsity/model_sparsity": 0.8411650158602944, + "compression_loss": 96.87652587890625, + "distillation_loss": 4.939325332641602, + "epoch": 3.66, + "learning_rate": 3.524466986005447e-05, + "loss": 101.4992, + "step": 4325, + "task_loss": 1.893997311592102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9099652176829861, + "compression/movement_sparsity/importance_threshold": -0.0006305797398879704, + "compression/movement_sparsity/linear_layer_sparsity": 0.8713035438053853, + "compression/movement_sparsity/model_sparsity": 0.8413715866324484, + "compression_loss": 96.89399719238281, + "distillation_loss": 5.318375587463379, + "epoch": 3.66, + "learning_rate": 3.5239973701512166e-05, + "loss": 101.3714, + "step": 4326, + "task_loss": 2.883091926574707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9101347746672453, + "compression/movement_sparsity/importance_threshold": -0.0006293922077334089, + "compression/movement_sparsity/linear_layer_sparsity": 0.8713783083364622, + "compression/movement_sparsity/model_sparsity": 0.8414437827718801, + "compression_loss": 96.91153717041016, + "distillation_loss": 5.299679756164551, + "epoch": 3.66, + "learning_rate": 3.523527754296985e-05, + "loss": 100.9041, + "step": 4327, + "task_loss": 2.580761194229126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.910304118640246, + "compression/movement_sparsity/importance_threshold": -0.0006282061674532172, + "compression/movement_sparsity/linear_layer_sparsity": 0.8714548495685168, + "compression/movement_sparsity/model_sparsity": 0.8415176945771453, + "compression_loss": 96.92901611328125, + "distillation_loss": 4.32697057723999, + "epoch": 3.66, + "learning_rate": 3.523058138442754e-05, + "loss": 100.9766, + "step": 4328, + "task_loss": 2.1089797019958496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9104732497358727, + "compression/movement_sparsity/importance_threshold": -0.0006270216181096999, + "compression/movement_sparsity/linear_layer_sparsity": 0.87157865820108, + "compression/movement_sparsity/model_sparsity": 0.8416372500023, + "compression_loss": 96.94647216796875, + "distillation_loss": 4.445263385772705, + "epoch": 3.66, + "learning_rate": 3.522588522588523e-05, + "loss": 101.4044, + "step": 4329, + "task_loss": 1.5468918085098267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9106421680880112, + "compression/movement_sparsity/importance_threshold": -0.0006258385587651592, + "compression/movement_sparsity/linear_layer_sparsity": 0.8716285846909714, + "compression/movement_sparsity/model_sparsity": 0.8416854613636717, + "compression_loss": 96.9638671875, + "distillation_loss": 4.71537971496582, + "epoch": 3.66, + "learning_rate": 3.522118906734291e-05, + "loss": 101.8231, + "step": 4330, + "task_loss": 2.995622396469116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9108108738305466, + "compression/movement_sparsity/importance_threshold": -0.0006246569884819, + "compression/movement_sparsity/linear_layer_sparsity": 0.87176033481918, + "compression/movement_sparsity/model_sparsity": 0.8418126854696657, + "compression_loss": 96.98127746582031, + "distillation_loss": 4.868564128875732, + "epoch": 3.66, + "learning_rate": 3.5216492908800604e-05, + "loss": 101.7384, + "step": 4331, + "task_loss": 2.103773832321167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9109793670973635, + "compression/movement_sparsity/importance_threshold": -0.0006234769063222277, + "compression/movement_sparsity/linear_layer_sparsity": 0.8718979158653626, + "compression/movement_sparsity/model_sparsity": 0.8419455401836631, + "compression_loss": 96.99869537353516, + "distillation_loss": 4.327425956726074, + "epoch": 3.66, + "learning_rate": 3.521179675025829e-05, + "loss": 101.0718, + "step": 4332, + "task_loss": 2.1862993240356445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9111476480223475, + "compression/movement_sparsity/importance_threshold": -0.0006222983113484438, + "compression/movement_sparsity/linear_layer_sparsity": 0.8720405766069581, + "compression/movement_sparsity/model_sparsity": 0.842083300089909, + "compression_loss": 97.01604461669922, + "distillation_loss": 4.899751663208008, + "epoch": 3.66, + "learning_rate": 3.520710059171598e-05, + "loss": 101.7338, + "step": 4333, + "task_loss": 2.4079816341400146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9113157167393836, + "compression/movement_sparsity/importance_threshold": -0.0006211212026228539, + "compression/movement_sparsity/linear_layer_sparsity": 0.8722104363749309, + "compression/movement_sparsity/model_sparsity": 0.8422473246523022, + "compression_loss": 97.03340911865234, + "distillation_loss": 5.685111045837402, + "epoch": 3.66, + "learning_rate": 3.5202404433173664e-05, + "loss": 101.6567, + "step": 4334, + "task_loss": 2.3303985595703125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9114835733823571, + "compression/movement_sparsity/importance_threshold": -0.000619945579207761, + "compression/movement_sparsity/linear_layer_sparsity": 0.8722908291131319, + "compression/movement_sparsity/model_sparsity": 0.842324955652629, + "compression_loss": 97.05079650878906, + "distillation_loss": 4.43789529800415, + "epoch": 3.66, + "learning_rate": 3.519770827463135e-05, + "loss": 101.0301, + "step": 4335, + "task_loss": 2.600677728652954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9116512180851528, + "compression/movement_sparsity/importance_threshold": -0.0006187714401654706, + "compression/movement_sparsity/linear_layer_sparsity": 0.8722923434824217, + "compression/movement_sparsity/model_sparsity": 0.8423264179986749, + "compression_loss": 97.06805419921875, + "distillation_loss": 4.913663864135742, + "epoch": 3.66, + "learning_rate": 3.519301211608904e-05, + "loss": 101.0199, + "step": 4336, + "task_loss": 1.9883872270584106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9118186509816562, + "compression/movement_sparsity/importance_threshold": -0.0006175987845582851, + "compression/movement_sparsity/linear_layer_sparsity": 0.8723394677929186, + "compression/movement_sparsity/model_sparsity": 0.8423719234441349, + "compression_loss": 97.08537292480469, + "distillation_loss": 3.683157444000244, + "epoch": 3.67, + "learning_rate": 3.518831595754673e-05, + "loss": 100.8494, + "step": 4337, + "task_loss": 1.5717813968658447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9119858722057521, + "compression/movement_sparsity/importance_threshold": -0.00061642761144851, + "compression/movement_sparsity/linear_layer_sparsity": 0.8724589002559594, + "compression/movement_sparsity/model_sparsity": 0.842487253034653, + "compression_loss": 97.10261535644531, + "distillation_loss": 4.721992492675781, + "epoch": 3.67, + "learning_rate": 3.5183619799004416e-05, + "loss": 101.6085, + "step": 4338, + "task_loss": 3.078033685684204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9121528818913259, + "compression/movement_sparsity/importance_threshold": -0.0006152579198984475, + "compression/movement_sparsity/linear_layer_sparsity": 0.8725738492319691, + "compression/movement_sparsity/model_sparsity": 0.8425982531597124, + "compression_loss": 97.11984252929688, + "distillation_loss": 5.826071262359619, + "epoch": 3.67, + "learning_rate": 3.51789236404621e-05, + "loss": 102.0375, + "step": 4339, + "task_loss": 2.6733133792877197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9123196801722628, + "compression/movement_sparsity/importance_threshold": -0.0006140897089704023, + "compression/movement_sparsity/linear_layer_sparsity": 0.872663745531776, + "compression/movement_sparsity/model_sparsity": 0.8426850612450675, + "compression_loss": 97.13713836669922, + "distillation_loss": 6.338521957397461, + "epoch": 3.67, + "learning_rate": 3.517422748191979e-05, + "loss": 101.3408, + "step": 4340, + "task_loss": 2.889519691467285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9124862671824476, + "compression/movement_sparsity/importance_threshold": -0.0006129229777266793, + "compression/movement_sparsity/linear_layer_sparsity": 0.8728442058847771, + "compression/movement_sparsity/model_sparsity": 0.842859322229782, + "compression_loss": 97.15434265136719, + "distillation_loss": 3.1071767807006836, + "epoch": 3.67, + "learning_rate": 3.516953132337748e-05, + "loss": 101.2222, + "step": 4341, + "task_loss": 1.503021478652954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9126526430557658, + "compression/movement_sparsity/importance_threshold": -0.0006117577252295813, + "compression/movement_sparsity/linear_layer_sparsity": 0.8729242647462843, + "compression/movement_sparsity/model_sparsity": 0.8429366308231065, + "compression_loss": 97.17164611816406, + "distillation_loss": 4.599024772644043, + "epoch": 3.67, + "learning_rate": 3.516483516483517e-05, + "loss": 101.8892, + "step": 4342, + "task_loss": 1.7505542039871216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9128188079261023, + "compression/movement_sparsity/importance_threshold": -0.0006105939505414125, + "compression/movement_sparsity/linear_layer_sparsity": 0.8730208028074642, + "compression/movement_sparsity/model_sparsity": 0.8430298525048991, + "compression_loss": 97.18876647949219, + "distillation_loss": 3.0789284706115723, + "epoch": 3.67, + "learning_rate": 3.5160139006292854e-05, + "loss": 101.9085, + "step": 4343, + "task_loss": 2.671783447265625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9129847619273423, + "compression/movement_sparsity/importance_threshold": -0.0006094316527244766, + "compression/movement_sparsity/linear_layer_sparsity": 0.8731325561065475, + "compression/movement_sparsity/model_sparsity": 0.8431377667343656, + "compression_loss": 97.20599365234375, + "distillation_loss": 3.1726179122924805, + "epoch": 3.67, + "learning_rate": 3.515544284775054e-05, + "loss": 101.1661, + "step": 4344, + "task_loss": 2.655129909515381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.913150505193371, + "compression/movement_sparsity/importance_threshold": -0.0006082708308410793, + "compression/movement_sparsity/linear_layer_sparsity": 0.873208477281885, + "compression/movement_sparsity/model_sparsity": 0.8432110797837694, + "compression_loss": 97.22312927246094, + "distillation_loss": 3.5077004432678223, + "epoch": 3.67, + "learning_rate": 3.5150746689208234e-05, + "loss": 101.5269, + "step": 4345, + "task_loss": 1.9847369194030762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9133160378580736, + "compression/movement_sparsity/importance_threshold": -0.0006071114839535218, + "compression/movement_sparsity/linear_layer_sparsity": 0.8732438920597635, + "compression/movement_sparsity/model_sparsity": 0.8432452779550792, + "compression_loss": 97.24030303955078, + "distillation_loss": 4.9577836990356445, + "epoch": 3.67, + "learning_rate": 3.514605053066592e-05, + "loss": 101.5755, + "step": 4346, + "task_loss": 2.5230162143707275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9134813600553351, + "compression/movement_sparsity/importance_threshold": -0.0006059536111241107, + "compression/movement_sparsity/linear_layer_sparsity": 0.8734023880959795, + "compression/movement_sparsity/model_sparsity": 0.8433983291648602, + "compression_loss": 97.2574234008789, + "distillation_loss": 4.377408027648926, + "epoch": 3.67, + "learning_rate": 3.51413543721236e-05, + "loss": 100.9028, + "step": 4347, + "task_loss": 2.3648786544799805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9136464719190405, + "compression/movement_sparsity/importance_threshold": -0.0006047972114151491, + "compression/movement_sparsity/linear_layer_sparsity": 0.8735133663241664, + "compression/movement_sparsity/model_sparsity": 0.8435054949495001, + "compression_loss": 97.2745132446289, + "distillation_loss": 4.381985187530518, + "epoch": 3.67, + "learning_rate": 3.513665821358129e-05, + "loss": 101.8533, + "step": 4348, + "task_loss": 2.47784161567688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9138113735830754, + "compression/movement_sparsity/importance_threshold": -0.000603642283888939, + "compression/movement_sparsity/linear_layer_sparsity": 0.873633132663901, + "compression/movement_sparsity/model_sparsity": 0.8436211469470204, + "compression_loss": 97.29158020019531, + "distillation_loss": 4.322001934051514, + "epoch": 3.68, + "learning_rate": 3.513196205503898e-05, + "loss": 101.2804, + "step": 4349, + "task_loss": 2.2859482765197754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9139760651813245, + "compression/movement_sparsity/importance_threshold": -0.0006024888276077887, + "compression/movement_sparsity/linear_layer_sparsity": 0.8737948005287081, + "compression/movement_sparsity/model_sparsity": 0.8437772610233228, + "compression_loss": 97.30862426757812, + "distillation_loss": 4.027581691741943, + "epoch": 3.68, + "learning_rate": 3.512726589649667e-05, + "loss": 101.6502, + "step": 4350, + "task_loss": 3.063328981399536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9141405468476731, + "compression/movement_sparsity/importance_threshold": -0.0006013368416339978, + "compression/movement_sparsity/linear_layer_sparsity": 0.8739063391927739, + "compression/movement_sparsity/model_sparsity": 0.843884967991145, + "compression_loss": 97.32561492919922, + "distillation_loss": 4.0175886154174805, + "epoch": 3.68, + "learning_rate": 3.512256973795435e-05, + "loss": 101.8392, + "step": 4351, + "task_loss": 1.8267052173614502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9143048187160064, + "compression/movement_sparsity/importance_threshold": -0.0006001863250298728, + "compression/movement_sparsity/linear_layer_sparsity": 0.8740634997222146, + "compression/movement_sparsity/model_sparsity": 0.844036729572917, + "compression_loss": 97.34266662597656, + "distillation_loss": 5.036438941955566, + "epoch": 3.68, + "learning_rate": 3.5117873579412045e-05, + "loss": 101.6746, + "step": 4352, + "task_loss": 2.3102707862854004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9144688809202095, + "compression/movement_sparsity/importance_threshold": -0.0005990372768577166, + "compression/movement_sparsity/linear_layer_sparsity": 0.8741300127292875, + "compression/movement_sparsity/model_sparsity": 0.844100957653579, + "compression_loss": 97.35956573486328, + "distillation_loss": 4.079525947570801, + "epoch": 3.68, + "learning_rate": 3.511317742086973e-05, + "loss": 101.1357, + "step": 4353, + "task_loss": 2.088468074798584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9146327335941675, + "compression/movement_sparsity/importance_threshold": -0.0005978896961798342, + "compression/movement_sparsity/linear_layer_sparsity": 0.8742763341903471, + "compression/movement_sparsity/model_sparsity": 0.8442422525223138, + "compression_loss": 97.37655639648438, + "distillation_loss": 6.250157356262207, + "epoch": 3.68, + "learning_rate": 3.510848126232742e-05, + "loss": 101.7325, + "step": 4354, + "task_loss": 3.244389057159424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9147963768717657, + "compression/movement_sparsity/importance_threshold": -0.0005967435820585275, + "compression/movement_sparsity/linear_layer_sparsity": 0.8744145829899173, + "compression/movement_sparsity/model_sparsity": 0.8443757520503157, + "compression_loss": 97.3934555053711, + "distillation_loss": 3.5031533241271973, + "epoch": 3.68, + "learning_rate": 3.5103785103785104e-05, + "loss": 102.3573, + "step": 4355, + "task_loss": 1.8085384368896484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.914959810886889, + "compression/movement_sparsity/importance_threshold": -0.0005955989335561023, + "compression/movement_sparsity/linear_layer_sparsity": 0.8745141974863473, + "compression/movement_sparsity/model_sparsity": 0.8444719444823433, + "compression_loss": 97.4102783203125, + "distillation_loss": 5.592926979064941, + "epoch": 3.68, + "learning_rate": 3.509908894524279e-05, + "loss": 102.5116, + "step": 4356, + "task_loss": 3.1112921237945557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9151230357734226, + "compression/movement_sparsity/importance_threshold": -0.0005944557497348634, + "compression/movement_sparsity/linear_layer_sparsity": 0.874623351316886, + "compression/movement_sparsity/model_sparsity": 0.8445773485430065, + "compression_loss": 97.42716979980469, + "distillation_loss": 5.326101303100586, + "epoch": 3.68, + "learning_rate": 3.509439278670048e-05, + "loss": 101.6659, + "step": 4357, + "task_loss": 2.8373169898986816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9152860516652519, + "compression/movement_sparsity/importance_threshold": -0.000593314029657112, + "compression/movement_sparsity/linear_layer_sparsity": 0.8747176476345504, + "compression/movement_sparsity/model_sparsity": 0.8446684054920697, + "compression_loss": 97.44400787353516, + "distillation_loss": 3.7424263954162598, + "epoch": 3.68, + "learning_rate": 3.508969662815817e-05, + "loss": 101.0525, + "step": 4358, + "task_loss": 2.054537057876587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9154488586962616, + "compression/movement_sparsity/importance_threshold": -0.0005921737723851555, + "compression/movement_sparsity/linear_layer_sparsity": 0.8748442345981727, + "compression/movement_sparsity/model_sparsity": 0.8447906438040645, + "compression_loss": 97.4608383178711, + "distillation_loss": 5.851455211639404, + "epoch": 3.68, + "learning_rate": 3.5085000469615856e-05, + "loss": 101.7026, + "step": 4359, + "task_loss": 4.171334266662598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9156114570003372, + "compression/movement_sparsity/importance_threshold": -0.0005910349769812943, + "compression/movement_sparsity/linear_layer_sparsity": 0.875002981041909, + "compression/movement_sparsity/model_sparsity": 0.8449439368190973, + "compression_loss": 97.4776382446289, + "distillation_loss": 4.725736141204834, + "epoch": 3.69, + "learning_rate": 3.508030431107354e-05, + "loss": 101.7928, + "step": 4360, + "task_loss": 2.2290990352630615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9157738467113636, + "compression/movement_sparsity/importance_threshold": -0.000589897642507835, + "compression/movement_sparsity/linear_layer_sparsity": 0.8750622441550592, + "compression/movement_sparsity/model_sparsity": 0.8450011640619961, + "compression_loss": 97.49446105957031, + "distillation_loss": 3.9074432849884033, + "epoch": 3.69, + "learning_rate": 3.507560815253123e-05, + "loss": 101.9479, + "step": 4361, + "task_loss": 2.1035819053649902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9159360279632263, + "compression/movement_sparsity/importance_threshold": -0.0005887617680270804, + "compression/movement_sparsity/linear_layer_sparsity": 0.8752593148735771, + "compression/movement_sparsity/model_sparsity": 0.8451914647950726, + "compression_loss": 97.51126861572266, + "distillation_loss": 4.822877883911133, + "epoch": 3.69, + "learning_rate": 3.507091199398892e-05, + "loss": 102.6587, + "step": 4362, + "task_loss": 3.1632156372070312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9160980008898099, + "compression/movement_sparsity/importance_threshold": -0.0005876273526013356, + "compression/movement_sparsity/linear_layer_sparsity": 0.8753526691819983, + "compression/movement_sparsity/model_sparsity": 0.845281612095808, + "compression_loss": 97.52800750732422, + "distillation_loss": 5.700374126434326, + "epoch": 3.69, + "learning_rate": 3.506621583544661e-05, + "loss": 102.7766, + "step": 4363, + "task_loss": 2.5474703311920166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.916259765625, + "compression/movement_sparsity/importance_threshold": -0.0005864943952929025, + "compression/movement_sparsity/linear_layer_sparsity": 0.8753819072410414, + "compression/movement_sparsity/model_sparsity": 0.8453098457375762, + "compression_loss": 97.54472351074219, + "distillation_loss": 2.6779568195343018, + "epoch": 3.69, + "learning_rate": 3.5061519676904294e-05, + "loss": 100.8703, + "step": 4364, + "task_loss": 0.8352637887001038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9164213223026816, + "compression/movement_sparsity/importance_threshold": -0.0005853628951640868, + "compression/movement_sparsity/linear_layer_sparsity": 0.8754710046216166, + "compression/movement_sparsity/model_sparsity": 0.845395882349033, + "compression_loss": 97.56146240234375, + "distillation_loss": 5.734720230102539, + "epoch": 3.69, + "learning_rate": 3.505682351836198e-05, + "loss": 102.056, + "step": 4365, + "task_loss": 3.1355488300323486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9165826710567396, + "compression/movement_sparsity/importance_threshold": -0.0005842328512771925, + "compression/movement_sparsity/linear_layer_sparsity": 0.8754797450364937, + "compression/movement_sparsity/model_sparsity": 0.8454043225037704, + "compression_loss": 97.57810974121094, + "distillation_loss": 4.414103031158447, + "epoch": 3.69, + "learning_rate": 3.505212735981967e-05, + "loss": 101.6781, + "step": 4366, + "task_loss": 2.1873064041137695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9167438120210597, + "compression/movement_sparsity/importance_threshold": -0.0005831042626945216, + "compression/movement_sparsity/linear_layer_sparsity": 0.8755191067138597, + "compression/movement_sparsity/model_sparsity": 0.8454423319864282, + "compression_loss": 97.59477996826172, + "distillation_loss": 4.569825172424316, + "epoch": 3.69, + "learning_rate": 3.504743120127736e-05, + "loss": 101.7669, + "step": 4367, + "task_loss": 3.6755456924438477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9169047453295265, + "compression/movement_sparsity/importance_threshold": -0.0005819771284783807, + "compression/movement_sparsity/linear_layer_sparsity": 0.8755277040387252, + "compression/movement_sparsity/model_sparsity": 0.845450633966736, + "compression_loss": 97.61133575439453, + "distillation_loss": 3.863023281097412, + "epoch": 3.69, + "learning_rate": 3.504273504273504e-05, + "loss": 101.4991, + "step": 4368, + "task_loss": 2.6442606449127197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9170654711160253, + "compression/movement_sparsity/importance_threshold": -0.0005808514476910729, + "compression/movement_sparsity/linear_layer_sparsity": 0.8756684807618341, + "compression/movement_sparsity/model_sparsity": 0.8455865745763264, + "compression_loss": 97.62783813476562, + "distillation_loss": 7.302313327789307, + "epoch": 3.69, + "learning_rate": 3.503803888419273e-05, + "loss": 102.2783, + "step": 4369, + "task_loss": 4.270925521850586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9172259895144413, + "compression/movement_sparsity/importance_threshold": -0.000579727219394901, + "compression/movement_sparsity/linear_layer_sparsity": 0.8757571727207096, + "compression/movement_sparsity/model_sparsity": 0.8456722196935662, + "compression_loss": 97.64445495605469, + "distillation_loss": 5.230157852172852, + "epoch": 3.69, + "learning_rate": 3.503334272565042e-05, + "loss": 102.6004, + "step": 4370, + "task_loss": 2.3331515789031982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9173863006586598, + "compression/movement_sparsity/importance_threshold": -0.00057860444265217, + "compression/movement_sparsity/linear_layer_sparsity": 0.8758812436849608, + "compression/movement_sparsity/model_sparsity": 0.8457920284385084, + "compression_loss": 97.66097259521484, + "distillation_loss": 5.888511657714844, + "epoch": 3.69, + "learning_rate": 3.5028646567108106e-05, + "loss": 101.958, + "step": 4371, + "task_loss": 2.2261836528778076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9175464046825655, + "compression/movement_sparsity/importance_threshold": -0.0005774831165251846, + "compression/movement_sparsity/linear_layer_sparsity": 0.8758866691812351, + "compression/movement_sparsity/model_sparsity": 0.8457972675522949, + "compression_loss": 97.67750549316406, + "distillation_loss": 3.242845058441162, + "epoch": 3.7, + "learning_rate": 3.502395040856579e-05, + "loss": 101.2045, + "step": 4372, + "task_loss": 1.6460928916931152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.917706301720044, + "compression/movement_sparsity/importance_threshold": -0.0005763632400762478, + "compression/movement_sparsity/linear_layer_sparsity": 0.8759871422177349, + "compression/movement_sparsity/model_sparsity": 0.8458942890308997, + "compression_loss": 97.6939697265625, + "distillation_loss": 5.436985969543457, + "epoch": 3.7, + "learning_rate": 3.501925425002348e-05, + "loss": 102.1062, + "step": 4373, + "task_loss": 3.447542190551758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9178659919049802, + "compression/movement_sparsity/importance_threshold": -0.0005752448123676627, + "compression/movement_sparsity/linear_layer_sparsity": 0.8759930923773852, + "compression/movement_sparsity/model_sparsity": 0.8459000347842612, + "compression_loss": 97.71038055419922, + "distillation_loss": 4.149409294128418, + "epoch": 3.7, + "learning_rate": 3.501455809148117e-05, + "loss": 102.2838, + "step": 4374, + "task_loss": 2.66291880607605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9180254753712593, + "compression/movement_sparsity/importance_threshold": -0.0005741278324617348, + "compression/movement_sparsity/linear_layer_sparsity": 0.8760632541797546, + "compression/movement_sparsity/model_sparsity": 0.8459677863128763, + "compression_loss": 97.72676849365234, + "distillation_loss": 3.26629376411438, + "epoch": 3.7, + "learning_rate": 3.500986193293886e-05, + "loss": 101.4659, + "step": 4375, + "task_loss": 3.901474714279175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9181847522527664, + "compression/movement_sparsity/importance_threshold": -0.000573012299420768, + "compression/movement_sparsity/linear_layer_sparsity": 0.8760599273369842, + "compression/movement_sparsity/model_sparsity": 0.8459645737573895, + "compression_loss": 97.74317169189453, + "distillation_loss": 3.4512977600097656, + "epoch": 3.7, + "learning_rate": 3.500516577439655e-05, + "loss": 102.0178, + "step": 4376, + "task_loss": 1.805649995803833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9183438226833867, + "compression/movement_sparsity/importance_threshold": -0.0005718982123070655, + "compression/movement_sparsity/linear_layer_sparsity": 0.8761914032093373, + "compression/movement_sparsity/model_sparsity": 0.8460915330290601, + "compression_loss": 97.75955963134766, + "distillation_loss": 4.556659698486328, + "epoch": 3.7, + "learning_rate": 3.500046961585423e-05, + "loss": 101.8656, + "step": 4377, + "task_loss": 3.28719162940979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9185026867970052, + "compression/movement_sparsity/importance_threshold": -0.000570785570182932, + "compression/movement_sparsity/linear_layer_sparsity": 0.8762492831190417, + "compression/movement_sparsity/model_sparsity": 0.8461474245858069, + "compression_loss": 97.77592468261719, + "distillation_loss": 4.052186012268066, + "epoch": 3.7, + "learning_rate": 3.499577345731192e-05, + "loss": 102.3659, + "step": 4378, + "task_loss": 2.1523499488830566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9186613447275072, + "compression/movement_sparsity/importance_threshold": -0.0005696743721106713, + "compression/movement_sparsity/linear_layer_sparsity": 0.8763227717641816, + "compression/movement_sparsity/model_sparsity": 0.8462183886699086, + "compression_loss": 97.7922592163086, + "distillation_loss": 4.916721820831299, + "epoch": 3.7, + "learning_rate": 3.499107729876961e-05, + "loss": 102.4063, + "step": 4379, + "task_loss": 1.7476212978363037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9188197966087779, + "compression/movement_sparsity/importance_threshold": -0.0005685646171525857, + "compression/movement_sparsity/linear_layer_sparsity": 0.876369943771349, + "compression/movement_sparsity/model_sparsity": 0.8462639401735118, + "compression_loss": 97.80854797363281, + "distillation_loss": 4.322973251342773, + "epoch": 3.7, + "learning_rate": 3.4986381140227296e-05, + "loss": 102.0809, + "step": 4380, + "task_loss": 1.897320032119751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9189780425747021, + "compression/movement_sparsity/importance_threshold": -0.0005674563043709824, + "compression/movement_sparsity/linear_layer_sparsity": 0.8764157206509031, + "compression/movement_sparsity/model_sparsity": 0.8463081444764271, + "compression_loss": 97.82487487792969, + "distillation_loss": 4.618863582611084, + "epoch": 3.7, + "learning_rate": 3.498168498168498e-05, + "loss": 102.7742, + "step": 4381, + "task_loss": 2.2519044876098633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9191360827591653, + "compression/movement_sparsity/importance_threshold": -0.0005663494328281629, + "compression/movement_sparsity/linear_layer_sparsity": 0.8765546491280285, + "compression/movement_sparsity/model_sparsity": 0.8464423003329693, + "compression_loss": 97.84112548828125, + "distillation_loss": 5.822296619415283, + "epoch": 3.7, + "learning_rate": 3.497698882314267e-05, + "loss": 103.0421, + "step": 4382, + "task_loss": 2.226498603820801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9192939172960525, + "compression/movement_sparsity/importance_threshold": -0.0005652440015864318, + "compression/movement_sparsity/linear_layer_sparsity": 0.8766760490787293, + "compression/movement_sparsity/model_sparsity": 0.8465595298218935, + "compression_loss": 97.85743713378906, + "distillation_loss": 3.742464303970337, + "epoch": 3.7, + "learning_rate": 3.497229266460036e-05, + "loss": 101.7592, + "step": 4383, + "task_loss": 2.0629873275756836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9194515463192489, + "compression/movement_sparsity/importance_threshold": -0.0005641400097080931, + "compression/movement_sparsity/linear_layer_sparsity": 0.8767725275190711, + "compression/movement_sparsity/model_sparsity": 0.8466526939310072, + "compression_loss": 97.87367248535156, + "distillation_loss": 5.440330505371094, + "epoch": 3.71, + "learning_rate": 3.496759650605805e-05, + "loss": 102.4077, + "step": 4384, + "task_loss": 2.4062983989715576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9196089699626395, + "compression/movement_sparsity/importance_threshold": -0.0005630374562554516, + "compression/movement_sparsity/linear_layer_sparsity": 0.8768474947609977, + "compression/movement_sparsity/model_sparsity": 0.8467250858175475, + "compression_loss": 97.8899154663086, + "distillation_loss": 4.898593902587891, + "epoch": 3.71, + "learning_rate": 3.4962900347515735e-05, + "loss": 102.895, + "step": 4385, + "task_loss": 2.6820461750030518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9197661883601096, + "compression/movement_sparsity/importance_threshold": -0.0005619363402908101, + "compression/movement_sparsity/linear_layer_sparsity": 0.8770033674803337, + "compression/movement_sparsity/model_sparsity": 0.8468756038294537, + "compression_loss": 97.90615844726562, + "distillation_loss": 3.1594228744506836, + "epoch": 3.71, + "learning_rate": 3.495820418897342e-05, + "loss": 102.0073, + "step": 4386, + "task_loss": 1.6229994297027588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9199232016455442, + "compression/movement_sparsity/importance_threshold": -0.0005608366608764728, + "compression/movement_sparsity/linear_layer_sparsity": 0.8771186503330373, + "compression/movement_sparsity/model_sparsity": 0.8469869263615153, + "compression_loss": 97.92240142822266, + "distillation_loss": 4.717376232147217, + "epoch": 3.71, + "learning_rate": 3.495350803043111e-05, + "loss": 102.3721, + "step": 4387, + "task_loss": 2.2538952827453613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9200800099528286, + "compression/movement_sparsity/importance_threshold": -0.0005597384170747443, + "compression/movement_sparsity/linear_layer_sparsity": 0.877221973245602, + "compression/movement_sparsity/model_sparsity": 0.847086699814175, + "compression_loss": 97.93862915039062, + "distillation_loss": 5.628116607666016, + "epoch": 3.71, + "learning_rate": 3.49488118718888e-05, + "loss": 102.9751, + "step": 4388, + "task_loss": 2.7923123836517334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9202366134158478, + "compression/movement_sparsity/importance_threshold": -0.0005586416079479276, + "compression/movement_sparsity/linear_layer_sparsity": 0.877284813609043, + "compression/movement_sparsity/model_sparsity": 0.8471473814178122, + "compression_loss": 97.95476531982422, + "distillation_loss": 4.245274066925049, + "epoch": 3.71, + "learning_rate": 3.494411571334649e-05, + "loss": 101.7774, + "step": 4389, + "task_loss": 3.085179567337036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9203930121684869, + "compression/movement_sparsity/importance_threshold": -0.0005575462325583293, + "compression/movement_sparsity/linear_layer_sparsity": 0.8773571217615869, + "compression/movement_sparsity/model_sparsity": 0.8472172055628703, + "compression_loss": 97.97088623046875, + "distillation_loss": 4.324873924255371, + "epoch": 3.71, + "learning_rate": 3.493941955480417e-05, + "loss": 102.3618, + "step": 4390, + "task_loss": 2.523642063140869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9205492063446312, + "compression/movement_sparsity/importance_threshold": -0.0005564522899682497, + "compression/movement_sparsity/linear_layer_sparsity": 0.877529986419804, + "compression/movement_sparsity/model_sparsity": 0.8473841317882838, + "compression_loss": 97.98705291748047, + "distillation_loss": 4.431741714477539, + "epoch": 3.71, + "learning_rate": 3.493472339626186e-05, + "loss": 102.5287, + "step": 4391, + "task_loss": 2.067526340484619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9207051960781659, + "compression/movement_sparsity/importance_threshold": -0.0005553597792399938, + "compression/movement_sparsity/linear_layer_sparsity": 0.8776264052393076, + "compression/movement_sparsity/model_sparsity": 0.8474772383247184, + "compression_loss": 98.00316619873047, + "distillation_loss": 5.192829132080078, + "epoch": 3.71, + "learning_rate": 3.4930027237719546e-05, + "loss": 102.5017, + "step": 4392, + "task_loss": 2.803555488586426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9208609815029759, + "compression/movement_sparsity/importance_threshold": -0.0005542686994358679, + "compression/movement_sparsity/linear_layer_sparsity": 0.8777513466677961, + "compression/movement_sparsity/model_sparsity": 0.8475978876307736, + "compression_loss": 98.01921081542969, + "distillation_loss": 5.146759033203125, + "epoch": 3.71, + "learning_rate": 3.492533107917724e-05, + "loss": 102.5415, + "step": 4393, + "task_loss": 2.4057204723358154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9210165627529465, + "compression/movement_sparsity/importance_threshold": -0.0005531790496181724, + "compression/movement_sparsity/linear_layer_sparsity": 0.877869240913212, + "compression/movement_sparsity/model_sparsity": 0.8477117318461742, + "compression_loss": 98.03527069091797, + "distillation_loss": 3.9340944290161133, + "epoch": 3.71, + "learning_rate": 3.492063492063492e-05, + "loss": 102.211, + "step": 4394, + "task_loss": 3.596043109893799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9211719399619628, + "compression/movement_sparsity/importance_threshold": -0.0005520908288492148, + "compression/movement_sparsity/linear_layer_sparsity": 0.8779362785836607, + "compression/movement_sparsity/model_sparsity": 0.8477764665664111, + "compression_loss": 98.05123138427734, + "distillation_loss": 4.112605094909668, + "epoch": 3.71, + "learning_rate": 3.491593876209261e-05, + "loss": 101.8275, + "step": 4395, + "task_loss": 2.5289621353149414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9213271132639099, + "compression/movement_sparsity/importance_threshold": -0.0005510040361912964, + "compression/movement_sparsity/linear_layer_sparsity": 0.878077079155105, + "compression/movement_sparsity/model_sparsity": 0.8479124302050731, + "compression_loss": 98.06721496582031, + "distillation_loss": 3.3166773319244385, + "epoch": 3.72, + "learning_rate": 3.49112426035503e-05, + "loss": 102.3046, + "step": 4396, + "task_loss": 1.3342349529266357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.921482082792673, + "compression/movement_sparsity/importance_threshold": -0.0005499186707067236, + "compression/movement_sparsity/linear_layer_sparsity": 0.8780705327870729, + "compression/movement_sparsity/model_sparsity": 0.8479061087249219, + "compression_loss": 98.08320617675781, + "distillation_loss": 3.243734836578369, + "epoch": 3.72, + "learning_rate": 3.4906546445007984e-05, + "loss": 102.0052, + "step": 4397, + "task_loss": 2.555058717727661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9216368486821372, + "compression/movement_sparsity/importance_threshold": -0.0005488347314577969, + "compression/movement_sparsity/linear_layer_sparsity": 0.8782110352543263, + "compression/movement_sparsity/model_sparsity": 0.848041784500189, + "compression_loss": 98.09909057617188, + "distillation_loss": 3.1129395961761475, + "epoch": 3.72, + "learning_rate": 3.490185028646567e-05, + "loss": 101.7379, + "step": 4398, + "task_loss": 2.2047152519226074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9217914110661877, + "compression/movement_sparsity/importance_threshold": -0.0005477522175068245, + "compression/movement_sparsity/linear_layer_sparsity": 0.8782021517494376, + "compression/movement_sparsity/model_sparsity": 0.8480332061710221, + "compression_loss": 98.11495208740234, + "distillation_loss": 4.724053382873535, + "epoch": 3.72, + "learning_rate": 3.489715412792336e-05, + "loss": 102.721, + "step": 4399, + "task_loss": 3.069983720779419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9219457700787095, + "compression/movement_sparsity/importance_threshold": -0.0005466711279161077, + "compression/movement_sparsity/linear_layer_sparsity": 0.8783199386773446, + "compression/movement_sparsity/model_sparsity": 0.8481469467556005, + "compression_loss": 98.13084411621094, + "distillation_loss": 4.570070266723633, + "epoch": 3.72, + "learning_rate": 3.489245796938105e-05, + "loss": 102.6567, + "step": 4400, + "task_loss": 1.9823681116104126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9220999258535879, + "compression/movement_sparsity/importance_threshold": -0.0005455914617479504, + "compression/movement_sparsity/linear_layer_sparsity": 0.8784398838795937, + "compression/movement_sparsity/model_sparsity": 0.8482627714711578, + "compression_loss": 98.1467056274414, + "distillation_loss": 4.059892177581787, + "epoch": 3.72, + "learning_rate": 3.4887761810838736e-05, + "loss": 101.9054, + "step": 4401, + "task_loss": 4.341288089752197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9222538785247081, + "compression/movement_sparsity/importance_threshold": -0.0005445132180646574, + "compression/movement_sparsity/linear_layer_sparsity": 0.8785101291511366, + "compression/movement_sparsity/model_sparsity": 0.8483306036015235, + "compression_loss": 98.16249084472656, + "distillation_loss": 6.237392425537109, + "epoch": 3.72, + "learning_rate": 3.488306565229642e-05, + "loss": 102.5395, + "step": 4402, + "task_loss": 2.472874164581299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.922407628225955, + "compression/movement_sparsity/importance_threshold": -0.0005434363959285326, + "compression/movement_sparsity/linear_layer_sparsity": 0.8785222798779576, + "compression/movement_sparsity/model_sparsity": 0.848342336913498, + "compression_loss": 98.17829895019531, + "distillation_loss": 3.581615447998047, + "epoch": 3.72, + "learning_rate": 3.487836949375411e-05, + "loss": 102.1503, + "step": 4403, + "task_loss": 1.8551411628723145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9225611750912138, + "compression/movement_sparsity/importance_threshold": -0.0005423609944018798, + "compression/movement_sparsity/linear_layer_sparsity": 0.8786094455433757, + "compression/movement_sparsity/model_sparsity": 0.8484265081701562, + "compression_loss": 98.19400024414062, + "distillation_loss": 5.7543110847473145, + "epoch": 3.72, + "learning_rate": 3.4873673335211795e-05, + "loss": 102.9049, + "step": 4404, + "task_loss": 3.0706069469451904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9227145192543698, + "compression/movement_sparsity/importance_threshold": -0.0005412870125470039, + "compression/movement_sparsity/linear_layer_sparsity": 0.8786684582490056, + "compression/movement_sparsity/model_sparsity": 0.8484834936078033, + "compression_loss": 98.20977783203125, + "distillation_loss": 3.6487467288970947, + "epoch": 3.72, + "learning_rate": 3.486897717666949e-05, + "loss": 101.872, + "step": 4405, + "task_loss": 0.9840919971466064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.922867660849308, + "compression/movement_sparsity/importance_threshold": -0.000540214449426207, + "compression/movement_sparsity/linear_layer_sparsity": 0.8787004984874431, + "compression/movement_sparsity/model_sparsity": 0.8485144331654833, + "compression_loss": 98.22547149658203, + "distillation_loss": 4.707341194152832, + "epoch": 3.72, + "learning_rate": 3.4864281018127175e-05, + "loss": 102.4773, + "step": 4406, + "task_loss": 2.048301935195923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9230206000099137, + "compression/movement_sparsity/importance_threshold": -0.0005391433041017938, + "compression/movement_sparsity/linear_layer_sparsity": 0.8787702310197777, + "compression/movement_sparsity/model_sparsity": 0.8485817701708097, + "compression_loss": 98.24114227294922, + "distillation_loss": 3.378244638442993, + "epoch": 3.72, + "learning_rate": 3.485958485958486e-05, + "loss": 102.2148, + "step": 4407, + "task_loss": 1.2717474699020386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9231733368700717, + "compression/movement_sparsity/importance_threshold": -0.0005380735756360701, + "compression/movement_sparsity/linear_layer_sparsity": 0.8788692254594906, + "compression/movement_sparsity/model_sparsity": 0.848677363846976, + "compression_loss": 98.25680541992188, + "distillation_loss": 4.324892997741699, + "epoch": 3.73, + "learning_rate": 3.485488870104255e-05, + "loss": 102.986, + "step": 4408, + "task_loss": 2.877359628677368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9233258715636676, + "compression/movement_sparsity/importance_threshold": -0.000537005263091337, + "compression/movement_sparsity/linear_layer_sparsity": 0.878921345996227, + "compression/movement_sparsity/model_sparsity": 0.8487276938829339, + "compression_loss": 98.27249908447266, + "distillation_loss": 7.238514423370361, + "epoch": 3.73, + "learning_rate": 3.485019254250024e-05, + "loss": 103.388, + "step": 4409, + "task_loss": 3.8885645866394043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9234782042245862, + "compression/movement_sparsity/importance_threshold": -0.0005359383655299012, + "compression/movement_sparsity/linear_layer_sparsity": 0.8790842182019653, + "compression/movement_sparsity/model_sparsity": 0.8488849709273516, + "compression_loss": 98.28812408447266, + "distillation_loss": 6.472870349884033, + "epoch": 3.73, + "learning_rate": 3.484549638395793e-05, + "loss": 103.4889, + "step": 4410, + "task_loss": 3.249234914779663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9236303349867128, + "compression/movement_sparsity/importance_threshold": -0.0005348728820140638, + "compression/movement_sparsity/linear_layer_sparsity": 0.879155548572763, + "compression/movement_sparsity/model_sparsity": 0.8489538508804745, + "compression_loss": 98.30377197265625, + "distillation_loss": 5.607931613922119, + "epoch": 3.73, + "learning_rate": 3.4840800225415607e-05, + "loss": 103.7559, + "step": 4411, + "task_loss": 3.070171594619751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9237822639839324, + "compression/movement_sparsity/importance_threshold": -0.0005338088116061315, + "compression/movement_sparsity/linear_layer_sparsity": 0.8793229758105381, + "compression/movement_sparsity/model_sparsity": 0.8491155264775657, + "compression_loss": 98.31938171386719, + "distillation_loss": 4.288600921630859, + "epoch": 3.73, + "learning_rate": 3.48361040668733e-05, + "loss": 103.0181, + "step": 4412, + "task_loss": 2.7368335723876953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9239339913501302, + "compression/movement_sparsity/importance_threshold": -0.0005327461533684063, + "compression/movement_sparsity/linear_layer_sparsity": 0.8795080388922467, + "compression/movement_sparsity/model_sparsity": 0.849294232073097, + "compression_loss": 98.3349380493164, + "distillation_loss": 3.6252031326293945, + "epoch": 3.73, + "learning_rate": 3.4831407908330986e-05, + "loss": 102.781, + "step": 4413, + "task_loss": 2.3738551139831543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9240855172191914, + "compression/movement_sparsity/importance_threshold": -0.0005316849063631938, + "compression/movement_sparsity/linear_layer_sparsity": 0.8795096009582071, + "compression/movement_sparsity/model_sparsity": 0.8492957404772861, + "compression_loss": 98.35049438476562, + "distillation_loss": 4.822930812835693, + "epoch": 3.73, + "learning_rate": 3.482671174978868e-05, + "loss": 102.8346, + "step": 4414, + "task_loss": 2.5510809421539307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9242368417250012, + "compression/movement_sparsity/importance_threshold": -0.0005306250696527964, + "compression/movement_sparsity/linear_layer_sparsity": 0.8795390298039324, + "compression/movement_sparsity/model_sparsity": 0.849324158351627, + "compression_loss": 98.36602783203125, + "distillation_loss": 4.712005615234375, + "epoch": 3.73, + "learning_rate": 3.482201559124636e-05, + "loss": 103.0312, + "step": 4415, + "task_loss": 2.530864715576172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9243879650014446, + "compression/movement_sparsity/importance_threshold": -0.0005295666422995195, + "compression/movement_sparsity/linear_layer_sparsity": 0.8796494237479051, + "compression/movement_sparsity/model_sparsity": 0.8494307599240128, + "compression_loss": 98.38152313232422, + "distillation_loss": 3.481534004211426, + "epoch": 3.73, + "learning_rate": 3.481731943270405e-05, + "loss": 102.2064, + "step": 4416, + "task_loss": 1.501293420791626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9245388871824067, + "compression/movement_sparsity/importance_threshold": -0.0005285096233656663, + "compression/movement_sparsity/linear_layer_sparsity": 0.8796395028404321, + "compression/movement_sparsity/model_sparsity": 0.8494211798302318, + "compression_loss": 98.39698791503906, + "distillation_loss": 3.4733471870422363, + "epoch": 3.73, + "learning_rate": 3.481262327416174e-05, + "loss": 102.7247, + "step": 4417, + "task_loss": 2.9259755611419678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9246896084017728, + "compression/movement_sparsity/importance_threshold": -0.0005274540119135406, + "compression/movement_sparsity/linear_layer_sparsity": 0.8797395585310647, + "compression/movement_sparsity/model_sparsity": 0.8495177983000838, + "compression_loss": 98.41239166259766, + "distillation_loss": 4.621831893920898, + "epoch": 3.73, + "learning_rate": 3.4807927115619424e-05, + "loss": 103.0856, + "step": 4418, + "task_loss": 1.6848464012145996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.924840128793428, + "compression/movement_sparsity/importance_threshold": -0.0005263998070054463, + "compression/movement_sparsity/linear_layer_sparsity": 0.8797769407966031, + "compression/movement_sparsity/model_sparsity": 0.8495538963697996, + "compression_loss": 98.42784118652344, + "distillation_loss": 4.113286972045898, + "epoch": 3.73, + "learning_rate": 3.480323095707712e-05, + "loss": 102.6295, + "step": 4419, + "task_loss": 1.558012843132019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9249904484912574, + "compression/movement_sparsity/importance_threshold": -0.0005253470077036882, + "compression/movement_sparsity/linear_layer_sparsity": 0.8798764360513567, + "compression/movement_sparsity/model_sparsity": 0.8496499736564693, + "compression_loss": 98.44325256347656, + "distillation_loss": 5.690581321716309, + "epoch": 3.74, + "learning_rate": 3.47985347985348e-05, + "loss": 103.4306, + "step": 4420, + "task_loss": 3.757448196411133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9251405676291462, + "compression/movement_sparsity/importance_threshold": -0.0005242956130705701, + "compression/movement_sparsity/linear_layer_sparsity": 0.8800254285259669, + "compression/movement_sparsity/model_sparsity": 0.849793847781222, + "compression_loss": 98.4586410522461, + "distillation_loss": 3.2571897506713867, + "epoch": 3.74, + "learning_rate": 3.479383863999249e-05, + "loss": 102.8102, + "step": 4421, + "task_loss": 1.1265833377838135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9252904863409794, + "compression/movement_sparsity/importance_threshold": -0.0005232456221683952, + "compression/movement_sparsity/linear_layer_sparsity": 0.8802376429373823, + "compression/movement_sparsity/model_sparsity": 0.8499987719747575, + "compression_loss": 98.47400665283203, + "distillation_loss": 4.661984443664551, + "epoch": 3.74, + "learning_rate": 3.4789142481450177e-05, + "loss": 103.2353, + "step": 4422, + "task_loss": 3.007985830307007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9254402047606423, + "compression/movement_sparsity/importance_threshold": -0.0005221970340594682, + "compression/movement_sparsity/linear_layer_sparsity": 0.8802697547208257, + "compression/movement_sparsity/model_sparsity": 0.8500297806196522, + "compression_loss": 98.48930358886719, + "distillation_loss": 3.7220749855041504, + "epoch": 3.74, + "learning_rate": 3.478444632290786e-05, + "loss": 103.1177, + "step": 4423, + "task_loss": 2.948399305343628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9255897230220199, + "compression/movement_sparsity/importance_threshold": -0.000521149847806093, + "compression/movement_sparsity/linear_layer_sparsity": 0.8803362558037309, + "compression/movement_sparsity/model_sparsity": 0.8500939971857784, + "compression_loss": 98.50460815429688, + "distillation_loss": 4.484265327453613, + "epoch": 3.74, + "learning_rate": 3.477975016436555e-05, + "loss": 102.7175, + "step": 4424, + "task_loss": 3.2189276218414307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9257390412589975, + "compression/movement_sparsity/importance_threshold": -0.0005201040624705734, + "compression/movement_sparsity/linear_layer_sparsity": 0.8804072642220024, + "compression/movement_sparsity/model_sparsity": 0.8501625662464348, + "compression_loss": 98.51992797851562, + "distillation_loss": 5.456811904907227, + "epoch": 3.74, + "learning_rate": 3.4775054005823236e-05, + "loss": 103.5835, + "step": 4425, + "task_loss": 3.655853033065796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9258881596054601, + "compression/movement_sparsity/importance_threshold": -0.0005190596771152134, + "compression/movement_sparsity/linear_layer_sparsity": 0.880610142010159, + "compression/movement_sparsity/model_sparsity": 0.8503584745584432, + "compression_loss": 98.53523254394531, + "distillation_loss": 4.379825115203857, + "epoch": 3.74, + "learning_rate": 3.477035784728093e-05, + "loss": 102.8895, + "step": 4426, + "task_loss": 2.307105541229248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9260370781952929, + "compression/movement_sparsity/importance_threshold": -0.0005180166908023169, + "compression/movement_sparsity/linear_layer_sparsity": 0.8807165413579738, + "compression/movement_sparsity/model_sparsity": 0.8504612187613378, + "compression_loss": 98.55043029785156, + "distillation_loss": 4.824334144592285, + "epoch": 3.74, + "learning_rate": 3.4765661688738615e-05, + "loss": 103.2126, + "step": 4427, + "task_loss": 3.119638204574585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.926185797162381, + "compression/movement_sparsity/importance_threshold": -0.0005169751025941877, + "compression/movement_sparsity/linear_layer_sparsity": 0.8807723583866773, + "compression/movement_sparsity/model_sparsity": 0.8505151183033921, + "compression_loss": 98.56568908691406, + "distillation_loss": 4.777392387390137, + "epoch": 3.74, + "learning_rate": 3.47609655301963e-05, + "loss": 103.1184, + "step": 4428, + "task_loss": 2.080717086791992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9263343166406097, + "compression/movement_sparsity/importance_threshold": -0.0005159349115531307, + "compression/movement_sparsity/linear_layer_sparsity": 0.8807464829429075, + "compression/movement_sparsity/model_sparsity": 0.850490131760718, + "compression_loss": 98.58094024658203, + "distillation_loss": 4.61302375793457, + "epoch": 3.74, + "learning_rate": 3.475626937165399e-05, + "loss": 102.4243, + "step": 4429, + "task_loss": 2.870847463607788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9264826367638639, + "compression/movement_sparsity/importance_threshold": -0.0005148961167414489, + "compression/movement_sparsity/linear_layer_sparsity": 0.880837631280316, + "compression/movement_sparsity/model_sparsity": 0.8505781488723314, + "compression_loss": 98.59613800048828, + "distillation_loss": 4.479694366455078, + "epoch": 3.74, + "learning_rate": 3.4751573213111674e-05, + "loss": 103.1985, + "step": 4430, + "task_loss": 2.971008777618408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9266307576660289, + "compression/movement_sparsity/importance_threshold": -0.000513858717221447, + "compression/movement_sparsity/linear_layer_sparsity": 0.8809093193761428, + "compression/movement_sparsity/model_sparsity": 0.8506473742615283, + "compression_loss": 98.61127471923828, + "distillation_loss": 5.196210861206055, + "epoch": 3.75, + "learning_rate": 3.474687705456937e-05, + "loss": 103.2381, + "step": 4431, + "task_loss": 3.1009883880615234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9267786794809898, + "compression/movement_sparsity/importance_threshold": -0.0005128227120554281, + "compression/movement_sparsity/linear_layer_sparsity": 0.8809243557515316, + "compression/movement_sparsity/model_sparsity": 0.8506618940911651, + "compression_loss": 98.62651824951172, + "distillation_loss": 4.621634483337402, + "epoch": 3.75, + "learning_rate": 3.474218089602705e-05, + "loss": 102.6086, + "step": 4432, + "task_loss": 3.353809356689453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9269264023426318, + "compression/movement_sparsity/importance_threshold": -0.000511788100305696, + "compression/movement_sparsity/linear_layer_sparsity": 0.8809761781840771, + "compression/movement_sparsity/model_sparsity": 0.8507119362637282, + "compression_loss": 98.64166259765625, + "distillation_loss": 5.1853251457214355, + "epoch": 3.75, + "learning_rate": 3.473748473748474e-05, + "loss": 104.3603, + "step": 4433, + "task_loss": 3.257661819458008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9270739263848399, + "compression/movement_sparsity/importance_threshold": -0.0005107548810345565, + "compression/movement_sparsity/linear_layer_sparsity": 0.8810888257957331, + "compression/movement_sparsity/model_sparsity": 0.8508207140833792, + "compression_loss": 98.65682220458984, + "distillation_loss": 3.831122636795044, + "epoch": 3.75, + "learning_rate": 3.4732788578942426e-05, + "loss": 102.2264, + "step": 4434, + "task_loss": 2.1604018211364746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9272212517414994, + "compression/movement_sparsity/importance_threshold": -0.0005097230533043116, + "compression/movement_sparsity/linear_layer_sparsity": 0.8811399208540528, + "compression/movement_sparsity/model_sparsity": 0.8508700538692588, + "compression_loss": 98.67196655273438, + "distillation_loss": 3.9108176231384277, + "epoch": 3.75, + "learning_rate": 3.472809242040011e-05, + "loss": 102.5798, + "step": 4435, + "task_loss": 1.7637782096862793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9273683785464952, + "compression/movement_sparsity/importance_threshold": -0.000508692616177267, + "compression/movement_sparsity/linear_layer_sparsity": 0.8811419837350538, + "compression/movement_sparsity/model_sparsity": 0.8508720458839513, + "compression_loss": 98.68704986572266, + "distillation_loss": 4.225275993347168, + "epoch": 3.75, + "learning_rate": 3.4723396261857806e-05, + "loss": 103.0752, + "step": 4436, + "task_loss": 2.394631862640381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9275153069337128, + "compression/movement_sparsity/importance_threshold": -0.0005076635687157258, + "compression/movement_sparsity/linear_layer_sparsity": 0.881258387459515, + "compression/movement_sparsity/model_sparsity": 0.8509844507823776, + "compression_loss": 98.70206451416016, + "distillation_loss": 2.862515449523926, + "epoch": 3.75, + "learning_rate": 3.4718700103315485e-05, + "loss": 102.6969, + "step": 4437, + "task_loss": 1.615430474281311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9276620370370371, + "compression/movement_sparsity/importance_threshold": -0.0005066359099819909, + "compression/movement_sparsity/linear_layer_sparsity": 0.8814404575751471, + "compression/movement_sparsity/model_sparsity": 0.8511602662294244, + "compression_loss": 98.71717834472656, + "distillation_loss": 5.4086174964904785, + "epoch": 3.75, + "learning_rate": 3.471400394477318e-05, + "loss": 103.4409, + "step": 4438, + "task_loss": 2.782451868057251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9278085689903532, + "compression/movement_sparsity/importance_threshold": -0.000505609639038368, + "compression/movement_sparsity/linear_layer_sparsity": 0.8814752642204762, + "compression/movement_sparsity/model_sparsity": 0.8511938771594086, + "compression_loss": 98.73219299316406, + "distillation_loss": 3.788233757019043, + "epoch": 3.75, + "learning_rate": 3.4709307786230865e-05, + "loss": 102.6537, + "step": 4439, + "task_loss": 1.8180707693099976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9279549029275463, + "compression/movement_sparsity/importance_threshold": -0.000504584754947161, + "compression/movement_sparsity/linear_layer_sparsity": 0.8815898673956244, + "compression/movement_sparsity/model_sparsity": 0.85130454336293, + "compression_loss": 98.74718475341797, + "distillation_loss": 4.155143737792969, + "epoch": 3.75, + "learning_rate": 3.470461162768856e-05, + "loss": 103.1576, + "step": 4440, + "task_loss": 1.5821231603622437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9281010389825016, + "compression/movement_sparsity/importance_threshold": -0.0005035612567706729, + "compression/movement_sparsity/linear_layer_sparsity": 0.8816934764882124, + "compression/movement_sparsity/model_sparsity": 0.8514045931644488, + "compression_loss": 98.76216125488281, + "distillation_loss": 5.316849231719971, + "epoch": 3.75, + "learning_rate": 3.469991546914624e-05, + "loss": 103.7195, + "step": 4441, + "task_loss": 2.2870032787323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9282469772891043, + "compression/movement_sparsity/importance_threshold": -0.0005025391435712068, + "compression/movement_sparsity/linear_layer_sparsity": 0.8818606890909702, + "compression/movement_sparsity/model_sparsity": 0.8515660614998957, + "compression_loss": 98.77710723876953, + "distillation_loss": 3.302790403366089, + "epoch": 3.75, + "learning_rate": 3.4695219310603924e-05, + "loss": 103.012, + "step": 4442, + "task_loss": 1.8679653406143188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9283927179812393, + "compression/movement_sparsity/importance_threshold": -0.00050151841441107, + "compression/movement_sparsity/linear_layer_sparsity": 0.8820303103755903, + "compression/movement_sparsity/model_sparsity": 0.851729855771573, + "compression_loss": 98.7920913696289, + "distillation_loss": 4.184677600860596, + "epoch": 3.76, + "learning_rate": 3.469052315206162e-05, + "loss": 103.2717, + "step": 4443, + "task_loss": 1.7728079557418823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.928538261192792, + "compression/movement_sparsity/importance_threshold": -0.0005004990683525629, + "compression/movement_sparsity/linear_layer_sparsity": 0.8820840645232928, + "compression/movement_sparsity/model_sparsity": 0.8517817632989347, + "compression_loss": 98.80699157714844, + "distillation_loss": 3.965651035308838, + "epoch": 3.76, + "learning_rate": 3.46858269935193e-05, + "loss": 102.8307, + "step": 4444, + "task_loss": 2.5893805027008057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9286836070576474, + "compression/movement_sparsity/importance_threshold": -0.0004994811044579921, + "compression/movement_sparsity/linear_layer_sparsity": 0.8821462252091785, + "compression/movement_sparsity/model_sparsity": 0.8518417885740317, + "compression_loss": 98.82188415527344, + "distillation_loss": 2.74373197555542, + "epoch": 3.76, + "learning_rate": 3.468113083497699e-05, + "loss": 102.1805, + "step": 4445, + "task_loss": 1.954851746559143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9288287557096907, + "compression/movement_sparsity/importance_threshold": -0.0004984645217896598, + "compression/movement_sparsity/linear_layer_sparsity": 0.8821824508304563, + "compression/movement_sparsity/model_sparsity": 0.8518767697337756, + "compression_loss": 98.83677673339844, + "distillation_loss": 5.503054141998291, + "epoch": 3.76, + "learning_rate": 3.4676434676434676e-05, + "loss": 102.7916, + "step": 4446, + "task_loss": 2.6752309799194336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.928973707282807, + "compression/movement_sparsity/importance_threshold": -0.0004974493194098706, + "compression/movement_sparsity/linear_layer_sparsity": 0.8822104845485682, + "compression/movement_sparsity/model_sparsity": 0.8519038404074285, + "compression_loss": 98.85163116455078, + "distillation_loss": 5.475777626037598, + "epoch": 3.76, + "learning_rate": 3.467173851789237e-05, + "loss": 102.9896, + "step": 4447, + "task_loss": 3.9551022052764893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9291184619108814, + "compression/movement_sparsity/importance_threshold": -0.0004964354963809285, + "compression/movement_sparsity/linear_layer_sparsity": 0.8822520283486115, + "compression/movement_sparsity/model_sparsity": 0.8519439570501367, + "compression_loss": 98.86638641357422, + "distillation_loss": 4.3310770988464355, + "epoch": 3.76, + "learning_rate": 3.4667042359350055e-05, + "loss": 103.2151, + "step": 4448, + "task_loss": 2.13908052444458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9292630197277991, + "compression/movement_sparsity/importance_threshold": -0.0004954230517651382, + "compression/movement_sparsity/linear_layer_sparsity": 0.8822478071932685, + "compression/movement_sparsity/model_sparsity": 0.8519398809044654, + "compression_loss": 98.88125610351562, + "distillation_loss": 3.1133248805999756, + "epoch": 3.76, + "learning_rate": 3.4662346200807735e-05, + "loss": 102.2601, + "step": 4449, + "task_loss": 0.6650223135948181 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9294073808674452, + "compression/movement_sparsity/importance_threshold": -0.0004944119846248029, + "compression/movement_sparsity/linear_layer_sparsity": 0.8823908733565635, + "compression/movement_sparsity/model_sparsity": 0.8520780323049283, + "compression_loss": 98.89591217041016, + "distillation_loss": 4.8558244705200195, + "epoch": 3.76, + "learning_rate": 3.465765004226543e-05, + "loss": 102.9615, + "step": 4450, + "task_loss": 2.8740787506103516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.929551545463705, + "compression/movement_sparsity/importance_threshold": -0.0004934022940222263, + "compression/movement_sparsity/linear_layer_sparsity": 0.8823779952555167, + "compression/movement_sparsity/model_sparsity": 0.8520655966062702, + "compression_loss": 98.91071319580078, + "distillation_loss": 6.644176959991455, + "epoch": 3.76, + "learning_rate": 3.4652953883723114e-05, + "loss": 104.3045, + "step": 4451, + "task_loss": 3.268507957458496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9296955136504633, + "compression/movement_sparsity/importance_threshold": -0.0004923939790197132, + "compression/movement_sparsity/linear_layer_sparsity": 0.8823653079411522, + "compression/movement_sparsity/model_sparsity": 0.8520533451401848, + "compression_loss": 98.92545318603516, + "distillation_loss": 4.655245304107666, + "epoch": 3.76, + "learning_rate": 3.464825772518081e-05, + "loss": 103.3562, + "step": 4452, + "task_loss": 2.356950283050537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9298392855616057, + "compression/movement_sparsity/importance_threshold": -0.0004913870386795667, + "compression/movement_sparsity/linear_layer_sparsity": 0.8824989778603503, + "compression/movement_sparsity/model_sparsity": 0.8521824230864417, + "compression_loss": 98.94017028808594, + "distillation_loss": 4.210940361022949, + "epoch": 3.76, + "learning_rate": 3.4643561566638494e-05, + "loss": 102.7465, + "step": 4453, + "task_loss": 2.505951404571533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9299828613310169, + "compression/movement_sparsity/importance_threshold": -0.0004903814720640916, + "compression/movement_sparsity/linear_layer_sparsity": 0.8825174006993477, + "compression/movement_sparsity/model_sparsity": 0.8522002130442442, + "compression_loss": 98.95484924316406, + "distillation_loss": 4.45470666885376, + "epoch": 3.76, + "learning_rate": 3.463886540809618e-05, + "loss": 103.4222, + "step": 4454, + "task_loss": 2.502286434173584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9301262410925824, + "compression/movement_sparsity/importance_threshold": -0.0004893772782355909, + "compression/movement_sparsity/linear_layer_sparsity": 0.8826204732043921, + "compression/movement_sparsity/model_sparsity": 0.8522997446916522, + "compression_loss": 98.96949005126953, + "distillation_loss": 5.719060897827148, + "epoch": 3.77, + "learning_rate": 3.4634169249553866e-05, + "loss": 103.7986, + "step": 4455, + "task_loss": 2.194148540496826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.930269424980187, + "compression/movement_sparsity/importance_threshold": -0.0004883744562563692, + "compression/movement_sparsity/linear_layer_sparsity": 0.8827367934596799, + "compression/movement_sparsity/model_sparsity": 0.8524120689883279, + "compression_loss": 98.9841079711914, + "distillation_loss": 4.041849613189697, + "epoch": 3.77, + "learning_rate": 3.462947309101155e-05, + "loss": 102.9149, + "step": 4456, + "task_loss": 2.3575069904327393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.930412413127716, + "compression/movement_sparsity/importance_threshold": -0.00048737300518873154, + "compression/movement_sparsity/linear_layer_sparsity": 0.8828104132706637, + "compression/movement_sparsity/model_sparsity": 0.8524831597323235, + "compression_loss": 98.99867248535156, + "distillation_loss": 5.348252296447754, + "epoch": 3.77, + "learning_rate": 3.4624776932469246e-05, + "loss": 103.0928, + "step": 4457, + "task_loss": 2.9070706367492676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9305552056690547, + "compression/movement_sparsity/importance_threshold": -0.00048637292409498076, + "compression/movement_sparsity/linear_layer_sparsity": 0.882902455920645, + "compression/movement_sparsity/model_sparsity": 0.8525720404341215, + "compression_loss": 99.0132064819336, + "distillation_loss": 4.84531307220459, + "epoch": 3.77, + "learning_rate": 3.4620080773926925e-05, + "loss": 103.7254, + "step": 4458, + "task_loss": 1.7577147483825684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9306978027380881, + "compression/movement_sparsity/importance_threshold": -0.0004853742120374191, + "compression/movement_sparsity/linear_layer_sparsity": 0.8828593739029765, + "compression/movement_sparsity/model_sparsity": 0.8525304384162958, + "compression_loss": 99.02774810791016, + "distillation_loss": 4.883671760559082, + "epoch": 3.77, + "learning_rate": 3.461538461538462e-05, + "loss": 102.8411, + "step": 4459, + "task_loss": 2.4027936458587646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9308402044687013, + "compression/movement_sparsity/importance_threshold": -0.0004843768680783539, + "compression/movement_sparsity/linear_layer_sparsity": 0.8829496040794772, + "compression/movement_sparsity/model_sparsity": 0.8526175689086531, + "compression_loss": 99.04227447509766, + "distillation_loss": 3.9085183143615723, + "epoch": 3.77, + "learning_rate": 3.4610688456842305e-05, + "loss": 103.5924, + "step": 4460, + "task_loss": 3.0719332695007324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9309824109947794, + "compression/movement_sparsity/importance_threshold": -0.00048338089128008733, + "compression/movement_sparsity/linear_layer_sparsity": 0.8829862589707898, + "compression/movement_sparsity/model_sparsity": 0.8526529645916855, + "compression_loss": 99.05672454833984, + "distillation_loss": 4.918030261993408, + "epoch": 3.77, + "learning_rate": 3.460599229829999e-05, + "loss": 104.3093, + "step": 4461, + "task_loss": 2.458925485610962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9311244224502077, + "compression/movement_sparsity/importance_threshold": -0.00048238628070492416, + "compression/movement_sparsity/linear_layer_sparsity": 0.8831751377861419, + "compression/movement_sparsity/model_sparsity": 0.852835354838671, + "compression_loss": 99.07120513916016, + "distillation_loss": 4.217590808868408, + "epoch": 3.77, + "learning_rate": 3.460129613975768e-05, + "loss": 103.4413, + "step": 4462, + "task_loss": 1.9830992221832275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9312662389688712, + "compression/movement_sparsity/importance_threshold": -0.0004813930354151674, + "compression/movement_sparsity/linear_layer_sparsity": 0.8832467424127952, + "compression/movement_sparsity/model_sparsity": 0.8529044996261173, + "compression_loss": 99.085693359375, + "distillation_loss": 4.841731071472168, + "epoch": 3.77, + "learning_rate": 3.4596599981215364e-05, + "loss": 103.5519, + "step": 4463, + "task_loss": 1.8403431177139282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9314078606846552, + "compression/movement_sparsity/importance_threshold": -0.00048040115447312015, + "compression/movement_sparsity/linear_layer_sparsity": 0.8833865413541581, + "compression/movement_sparsity/model_sparsity": 0.8530394960437725, + "compression_loss": 99.10017395019531, + "distillation_loss": 5.62965202331543, + "epoch": 3.77, + "learning_rate": 3.459190382267306e-05, + "loss": 104.0416, + "step": 4464, + "task_loss": 3.3953192234039307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9315492877314446, + "compression/movement_sparsity/importance_threshold": -0.00047941063694108884, + "compression/movement_sparsity/linear_layer_sparsity": 0.8834621644253047, + "compression/movement_sparsity/model_sparsity": 0.8531125212297814, + "compression_loss": 99.11463165283203, + "distillation_loss": 3.631840229034424, + "epoch": 3.77, + "learning_rate": 3.4587207664130743e-05, + "loss": 103.4229, + "step": 4465, + "task_loss": 2.492821216583252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9316905202431248, + "compression/movement_sparsity/importance_threshold": -0.0004784214818813765, + "compression/movement_sparsity/linear_layer_sparsity": 0.8835139630095149, + "compression/movement_sparsity/model_sparsity": 0.8531625403732729, + "compression_loss": 99.12906646728516, + "distillation_loss": 4.295794486999512, + "epoch": 3.77, + "learning_rate": 3.458251150558843e-05, + "loss": 103.6775, + "step": 4466, + "task_loss": 1.5640379190444946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9318315583535808, + "compression/movement_sparsity/importance_threshold": -0.00047743368835628623, + "compression/movement_sparsity/linear_layer_sparsity": 0.8835177668189907, + "compression/movement_sparsity/model_sparsity": 0.8531662135101914, + "compression_loss": 99.14353942871094, + "distillation_loss": 3.5572056770324707, + "epoch": 3.78, + "learning_rate": 3.4577815347046116e-05, + "loss": 103.1365, + "step": 4467, + "task_loss": 1.8369367122650146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9319724021966976, + "compression/movement_sparsity/importance_threshold": -0.00047644725542812447, + "compression/movement_sparsity/linear_layer_sparsity": 0.8836164750786805, + "compression/movement_sparsity/model_sparsity": 0.8532615308374986, + "compression_loss": 99.15791320800781, + "distillation_loss": 3.693638324737549, + "epoch": 3.78, + "learning_rate": 3.45731191885038e-05, + "loss": 103.407, + "step": 4468, + "task_loss": 2.400531530380249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9321130519063607, + "compression/movement_sparsity/importance_threshold": -0.00047546218215919166, + "compression/movement_sparsity/linear_layer_sparsity": 0.8836956634759502, + "compression/movement_sparsity/model_sparsity": 0.8533379988697102, + "compression_loss": 99.17223358154297, + "distillation_loss": 4.586825370788574, + "epoch": 3.78, + "learning_rate": 3.4568423029961496e-05, + "loss": 103.826, + "step": 4469, + "task_loss": 2.677643299102783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.932253507616455, + "compression/movement_sparsity/importance_threshold": -0.00047447846761179344, + "compression/movement_sparsity/linear_layer_sparsity": 0.8837266901601387, + "compression/movement_sparsity/model_sparsity": 0.8533679596918475, + "compression_loss": 99.18660736083984, + "distillation_loss": 5.2589545249938965, + "epoch": 3.78, + "learning_rate": 3.456372687141918e-05, + "loss": 103.3893, + "step": 4470, + "task_loss": 2.635115623474121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9323937694608657, + "compression/movement_sparsity/importance_threshold": -0.00047349611084823457, + "compression/movement_sparsity/linear_layer_sparsity": 0.8837526013764114, + "compression/movement_sparsity/model_sparsity": 0.8533929807781291, + "compression_loss": 99.20089721679688, + "distillation_loss": 4.475529193878174, + "epoch": 3.78, + "learning_rate": 3.455903071287687e-05, + "loss": 103.1814, + "step": 4471, + "task_loss": 2.3057191371917725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9325338375734779, + "compression/movement_sparsity/importance_threshold": -0.0004725151109308181, + "compression/movement_sparsity/linear_layer_sparsity": 0.8837134901065659, + "compression/movement_sparsity/model_sparsity": 0.8533552131007229, + "compression_loss": 99.2151870727539, + "distillation_loss": 5.528947830200195, + "epoch": 3.78, + "learning_rate": 3.4554334554334555e-05, + "loss": 103.7802, + "step": 4472, + "task_loss": 3.3463382720947266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9326737120881767, + "compression/movement_sparsity/importance_threshold": -0.0004715354669218479, + "compression/movement_sparsity/linear_layer_sparsity": 0.8837653363874466, + "compression/movement_sparsity/model_sparsity": 0.8534052783023577, + "compression_loss": 99.22945404052734, + "distillation_loss": 4.84735107421875, + "epoch": 3.78, + "learning_rate": 3.454963839579224e-05, + "loss": 103.5667, + "step": 4473, + "task_loss": 1.9125312566757202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9328133931388474, + "compression/movement_sparsity/importance_threshold": -0.0004705571778836296, + "compression/movement_sparsity/linear_layer_sparsity": 0.8839293413891102, + "compression/movement_sparsity/model_sparsity": 0.8535636492276758, + "compression_loss": 99.24368286132812, + "distillation_loss": 4.769996166229248, + "epoch": 3.78, + "learning_rate": 3.4544942237249934e-05, + "loss": 103.9858, + "step": 4474, + "task_loss": 4.012063980102539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.932952880859375, + "compression/movement_sparsity/importance_threshold": -0.00046958024287846456, + "compression/movement_sparsity/linear_layer_sparsity": 0.8839645415319712, + "compression/movement_sparsity/model_sparsity": 0.8535976401373412, + "compression_loss": 99.2579116821289, + "distillation_loss": 4.299485206604004, + "epoch": 3.78, + "learning_rate": 3.4540246078707614e-05, + "loss": 103.8881, + "step": 4475, + "task_loss": 1.764823317527771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9330921753836448, + "compression/movement_sparsity/importance_threshold": -0.00046860466096865835, + "compression/movement_sparsity/linear_layer_sparsity": 0.883999491267312, + "compression/movement_sparsity/model_sparsity": 0.8536313892417551, + "compression_loss": 99.27217864990234, + "distillation_loss": 4.71946907043457, + "epoch": 3.78, + "learning_rate": 3.453554992016531e-05, + "loss": 103.7803, + "step": 4476, + "task_loss": 2.5901169776916504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9332312768455415, + "compression/movement_sparsity/importance_threshold": -0.00046763043121651575, + "compression/movement_sparsity/linear_layer_sparsity": 0.8841489964811304, + "compression/movement_sparsity/model_sparsity": 0.853775758491547, + "compression_loss": 99.28636169433594, + "distillation_loss": 4.358282089233398, + "epoch": 3.78, + "learning_rate": 3.453085376162299e-05, + "loss": 103.1859, + "step": 4477, + "task_loss": 2.8471946716308594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9333701853789509, + "compression/movement_sparsity/importance_threshold": -0.00046665755268433806, + "compression/movement_sparsity/linear_layer_sparsity": 0.8842540126254995, + "compression/movement_sparsity/model_sparsity": 0.8538771670082895, + "compression_loss": 99.30052947998047, + "distillation_loss": 4.1689982414245605, + "epoch": 3.78, + "learning_rate": 3.4526157603080686e-05, + "loss": 103.225, + "step": 4478, + "task_loss": 2.4021146297454834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9335089011177578, + "compression/movement_sparsity/importance_threshold": -0.0004656860244344309, + "compression/movement_sparsity/linear_layer_sparsity": 0.8843395089074486, + "compression/movement_sparsity/model_sparsity": 0.8539597262299363, + "compression_loss": 99.31465148925781, + "distillation_loss": 3.7200088500976562, + "epoch": 3.79, + "learning_rate": 3.4521461444538366e-05, + "loss": 103.6094, + "step": 4479, + "task_loss": 2.500925064086914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9336474241958472, + "compression/movement_sparsity/importance_threshold": -0.0004647158455290991, + "compression/movement_sparsity/linear_layer_sparsity": 0.8843972218788062, + "compression/movement_sparsity/model_sparsity": 0.8540154565831819, + "compression_loss": 99.32880401611328, + "distillation_loss": 5.050673961639404, + "epoch": 3.79, + "learning_rate": 3.451676528599606e-05, + "loss": 103.7124, + "step": 4480, + "task_loss": 2.6829442977905273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9337857547471043, + "compression/movement_sparsity/importance_threshold": -0.00046374701503064646, + "compression/movement_sparsity/linear_layer_sparsity": 0.8844689576713035, + "compression/movement_sparsity/model_sparsity": 0.8540847280305219, + "compression_loss": 99.34294891357422, + "distillation_loss": 4.245477676391602, + "epoch": 3.79, + "learning_rate": 3.4512069127453745e-05, + "loss": 103.5173, + "step": 4481, + "task_loss": 2.4073433876037598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9339238929054146, + "compression/movement_sparsity/importance_threshold": -0.0004627795320013752, + "compression/movement_sparsity/linear_layer_sparsity": 0.8845436745057098, + "compression/movement_sparsity/model_sparsity": 0.8541568781118105, + "compression_loss": 99.35710144042969, + "distillation_loss": 4.010534286499023, + "epoch": 3.79, + "learning_rate": 3.450737296891143e-05, + "loss": 104.1237, + "step": 4482, + "task_loss": 2.232236623764038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9340618388046628, + "compression/movement_sparsity/importance_threshold": -0.00046181339550359097, + "compression/movement_sparsity/linear_layer_sparsity": 0.8846604955760384, + "compression/movement_sparsity/model_sparsity": 0.8542696860189896, + "compression_loss": 99.37120056152344, + "distillation_loss": 4.53864860534668, + "epoch": 3.79, + "learning_rate": 3.4502676810369125e-05, + "loss": 103.7125, + "step": 4483, + "task_loss": 1.998191475868225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9341995925787342, + "compression/movement_sparsity/importance_threshold": -0.00046084860459959676, + "compression/movement_sparsity/linear_layer_sparsity": 0.8847598358166128, + "compression/movement_sparsity/model_sparsity": 0.8543656136166939, + "compression_loss": 99.38532257080078, + "distillation_loss": 3.650279998779297, + "epoch": 3.79, + "learning_rate": 3.4497980651826804e-05, + "loss": 103.4996, + "step": 4484, + "task_loss": 1.7187650203704834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.934337154361514, + "compression/movement_sparsity/importance_threshold": -0.0004598851583516965, + "compression/movement_sparsity/linear_layer_sparsity": 0.8848687630879664, + "compression/movement_sparsity/model_sparsity": 0.854470798901177, + "compression_loss": 99.39935302734375, + "distillation_loss": 4.135272026062012, + "epoch": 3.79, + "learning_rate": 3.44932844932845e-05, + "loss": 103.6663, + "step": 4485, + "task_loss": 2.5874385833740234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9344745242868874, + "compression/movement_sparsity/importance_threshold": -0.00045892305582219406, + "compression/movement_sparsity/linear_layer_sparsity": 0.8849573835018362, + "compression/movement_sparsity/model_sparsity": 0.8545563749312021, + "compression_loss": 99.41337585449219, + "distillation_loss": 4.7910871505737305, + "epoch": 3.79, + "learning_rate": 3.4488588334742184e-05, + "loss": 103.2293, + "step": 4486, + "task_loss": 3.0408992767333984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9346117024887393, + "compression/movement_sparsity/importance_threshold": -0.0004579622960733951, + "compression/movement_sparsity/linear_layer_sparsity": 0.88496649356591, + "compression/movement_sparsity/model_sparsity": 0.8545651720365491, + "compression_loss": 99.42733001708984, + "distillation_loss": 4.788797855377197, + "epoch": 3.79, + "learning_rate": 3.448389217619987e-05, + "loss": 104.1743, + "step": 4487, + "task_loss": 3.5368692874908447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.934748689100955, + "compression/movement_sparsity/importance_threshold": -0.00045700287816760263, + "compression/movement_sparsity/linear_layer_sparsity": 0.8851069125639899, + "compression/movement_sparsity/model_sparsity": 0.8547007672100656, + "compression_loss": 99.44137573242188, + "distillation_loss": 3.9559102058410645, + "epoch": 3.79, + "learning_rate": 3.4479196017657556e-05, + "loss": 103.6178, + "step": 4488, + "task_loss": 3.2537994384765625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9348854842574196, + "compression/movement_sparsity/importance_threshold": -0.0004560448011671197, + "compression/movement_sparsity/linear_layer_sparsity": 0.885300012534685, + "compression/movement_sparsity/model_sparsity": 0.8548872336027225, + "compression_loss": 99.45539855957031, + "distillation_loss": 4.702653884887695, + "epoch": 3.79, + "learning_rate": 3.447449985911524e-05, + "loss": 103.5255, + "step": 4489, + "task_loss": 1.8245919942855835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9350220880920183, + "compression/movement_sparsity/importance_threshold": -0.00045508806413425106, + "compression/movement_sparsity/linear_layer_sparsity": 0.8854365800266185, + "compression/movement_sparsity/model_sparsity": 0.8550191095811773, + "compression_loss": 99.46935272216797, + "distillation_loss": 4.110479354858398, + "epoch": 3.79, + "learning_rate": 3.4469803700572936e-05, + "loss": 103.9041, + "step": 4490, + "task_loss": 2.568847894668579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9351585007386362, + "compression/movement_sparsity/importance_threshold": -0.0004541326661313006, + "compression/movement_sparsity/linear_layer_sparsity": 0.8855180220915715, + "compression/movement_sparsity/model_sparsity": 0.855097753860654, + "compression_loss": 99.48329162597656, + "distillation_loss": 4.633420944213867, + "epoch": 3.8, + "learning_rate": 3.446510754203062e-05, + "loss": 104.0522, + "step": 4491, + "task_loss": 2.521723747253418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9352947223311583, + "compression/movement_sparsity/importance_threshold": -0.000453178606220574, + "compression/movement_sparsity/linear_layer_sparsity": 0.8855661957288203, + "compression/movement_sparsity/model_sparsity": 0.8551442725852639, + "compression_loss": 99.49725341796875, + "distillation_loss": 5.031313896179199, + "epoch": 3.8, + "learning_rate": 3.446041138348831e-05, + "loss": 103.5796, + "step": 4492, + "task_loss": 3.401355266571045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.93543075300347, + "compression/movement_sparsity/importance_threshold": -0.00045222588346437165, + "compression/movement_sparsity/linear_layer_sparsity": 0.8856698882905819, + "compression/movement_sparsity/model_sparsity": 0.8552444029885333, + "compression_loss": 99.51112365722656, + "distillation_loss": 4.717130661010742, + "epoch": 3.8, + "learning_rate": 3.4455715224945995e-05, + "loss": 103.933, + "step": 4493, + "task_loss": 1.9160329103469849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9355665928894563, + "compression/movement_sparsity/importance_threshold": -0.00045127449692500005, + "compression/movement_sparsity/linear_layer_sparsity": 0.8856541603134701, + "compression/movement_sparsity/model_sparsity": 0.8552292153158203, + "compression_loss": 99.52499389648438, + "distillation_loss": 4.162189483642578, + "epoch": 3.8, + "learning_rate": 3.445101906640368e-05, + "loss": 103.4923, + "step": 4494, + "task_loss": 2.042285442352295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9357022421230023, + "compression/movement_sparsity/importance_threshold": -0.0004503244456647631, + "compression/movement_sparsity/linear_layer_sparsity": 0.8858037251481268, + "compression/movement_sparsity/model_sparsity": 0.8553736421382913, + "compression_loss": 99.53885650634766, + "distillation_loss": 5.512463092803955, + "epoch": 3.8, + "learning_rate": 3.4446322907861374e-05, + "loss": 103.6631, + "step": 4495, + "task_loss": 3.061004877090454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9358377008379933, + "compression/movement_sparsity/importance_threshold": -0.000449375728745963, + "compression/movement_sparsity/linear_layer_sparsity": 0.885813049847218, + "compression/movement_sparsity/model_sparsity": 0.8553826465052825, + "compression_loss": 99.55267333984375, + "distillation_loss": 5.128453731536865, + "epoch": 3.8, + "learning_rate": 3.444162674931906e-05, + "loss": 103.9955, + "step": 4496, + "task_loss": 2.7442898750305176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9359729691683143, + "compression/movement_sparsity/importance_threshold": -0.0004484283452309062, + "compression/movement_sparsity/linear_layer_sparsity": 0.8858931206328928, + "compression/movement_sparsity/model_sparsity": 0.8554599666131428, + "compression_loss": 99.56643676757812, + "distillation_loss": 5.407352447509766, + "epoch": 3.8, + "learning_rate": 3.443693059077675e-05, + "loss": 104.2077, + "step": 4497, + "task_loss": 2.7361977100372314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9361080472478505, + "compression/movement_sparsity/importance_threshold": -0.00044748229418189577, + "compression/movement_sparsity/linear_layer_sparsity": 0.8860048023869703, + "compression/movement_sparsity/model_sparsity": 0.8555678117553945, + "compression_loss": 99.58023071289062, + "distillation_loss": 5.9511518478393555, + "epoch": 3.8, + "learning_rate": 3.443223443223443e-05, + "loss": 104.3643, + "step": 4498, + "task_loss": 2.9797017574310303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.936242935210487, + "compression/movement_sparsity/importance_threshold": -0.0004465375746612347, + "compression/movement_sparsity/linear_layer_sparsity": 0.8860839788600725, + "compression/movement_sparsity/model_sparsity": 0.8556442682730703, + "compression_loss": 99.593994140625, + "distillation_loss": 5.060459136962891, + "epoch": 3.8, + "learning_rate": 3.442753827369212e-05, + "loss": 104.4925, + "step": 4499, + "task_loss": 1.8724145889282227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9363776331901089, + "compression/movement_sparsity/importance_threshold": -0.0004455941857312278, + "compression/movement_sparsity/linear_layer_sparsity": 0.8862552695281617, + "compression/movement_sparsity/model_sparsity": 0.8558096745797589, + "compression_loss": 99.60780334472656, + "distillation_loss": 3.715949535369873, + "epoch": 3.8, + "learning_rate": 3.442284211514981e-05, + "loss": 103.7155, + "step": 4500, + "task_loss": 1.512458086013794 + }, + { + "epoch": 3.8, + "eval_accuracy": 0.5622178217821783, + "eval_loss": 103.5300521850586, + "eval_runtime": 231.5763, + "eval_samples_per_second": 109.035, + "eval_steps_per_second": 0.855, + "step": 4500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9365121413206015, + "compression/movement_sparsity/importance_threshold": -0.00044465212645417897, + "compression/movement_sparsity/linear_layer_sparsity": 0.8863109315426858, + "compression/movement_sparsity/model_sparsity": 0.8558634244328478, + "compression_loss": 99.62156677246094, + "distillation_loss": 4.346074104309082, + "epoch": 3.8, + "learning_rate": 3.441814595660749e-05, + "loss": 103.7796, + "step": 4501, + "task_loss": 1.918161153793335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9366464597358498, + "compression/movement_sparsity/importance_threshold": -0.00044371139589239206, + "compression/movement_sparsity/linear_layer_sparsity": 0.8864163650329222, + "compression/movement_sparsity/model_sparsity": 0.8559652359583432, + "compression_loss": 99.6353530883789, + "distillation_loss": 4.872063159942627, + "epoch": 3.81, + "learning_rate": 3.4413449798065185e-05, + "loss": 104.2235, + "step": 4502, + "task_loss": 3.331578016281128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.936780588569739, + "compression/movement_sparsity/importance_threshold": -0.000442771993108171, + "compression/movement_sparsity/linear_layer_sparsity": 0.8864189525772991, + "compression/movement_sparsity/model_sparsity": 0.8559677346126106, + "compression_loss": 99.64900970458984, + "distillation_loss": 4.670708179473877, + "epoch": 3.81, + "learning_rate": 3.440875363952287e-05, + "loss": 104.3786, + "step": 4503, + "task_loss": 2.4652576446533203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9369145279561542, + "compression/movement_sparsity/importance_threshold": -0.00044183391716382057, + "compression/movement_sparsity/linear_layer_sparsity": 0.8865227405324017, + "compression/movement_sparsity/model_sparsity": 0.8560679571321662, + "compression_loss": 99.66278839111328, + "distillation_loss": 3.938246965408325, + "epoch": 3.81, + "learning_rate": 3.4404057480980565e-05, + "loss": 103.2839, + "step": 4504, + "task_loss": 1.8551729917526245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9370482780289806, + "compression/movement_sparsity/importance_threshold": -0.0004408971671216429, + "compression/movement_sparsity/linear_layer_sparsity": 0.8867438861453764, + "compression/movement_sparsity/model_sparsity": 0.8562815057130118, + "compression_loss": 99.67642211914062, + "distillation_loss": 4.400326251983643, + "epoch": 3.81, + "learning_rate": 3.4399361322438244e-05, + "loss": 104.1381, + "step": 4505, + "task_loss": 2.278688907623291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9371818389221033, + "compression/movement_sparsity/importance_threshold": -0.0004399617420439445, + "compression/movement_sparsity/linear_layer_sparsity": 0.886746914883956, + "compression/movement_sparsity/model_sparsity": 0.8562844304051036, + "compression_loss": 99.6900863647461, + "distillation_loss": 4.146790504455566, + "epoch": 3.81, + "learning_rate": 3.439466516389593e-05, + "loss": 103.8147, + "step": 4506, + "task_loss": 2.4132161140441895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9373152107694074, + "compression/movement_sparsity/importance_threshold": -0.0004390276409930276, + "compression/movement_sparsity/linear_layer_sparsity": 0.8869162261402175, + "compression/movement_sparsity/model_sparsity": 0.8564479252988503, + "compression_loss": 99.7037124633789, + "distillation_loss": 5.367804050445557, + "epoch": 3.81, + "learning_rate": 3.4389969005353624e-05, + "loss": 104.2516, + "step": 4507, + "task_loss": 3.248568296432495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9374483937047781, + "compression/movement_sparsity/importance_threshold": -0.00043809486303119597, + "compression/movement_sparsity/linear_layer_sparsity": 0.8870868729032544, + "compression/movement_sparsity/model_sparsity": 0.856612709820606, + "compression_loss": 99.71730041503906, + "distillation_loss": 3.691837787628174, + "epoch": 3.81, + "learning_rate": 3.438527284681131e-05, + "loss": 104.44, + "step": 4508, + "task_loss": 1.766798973083496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9375813878621005, + "compression/movement_sparsity/importance_threshold": -0.0004371634072207545, + "compression/movement_sparsity/linear_layer_sparsity": 0.8872165005296239, + "compression/movement_sparsity/model_sparsity": 0.8567378843392284, + "compression_loss": 99.73088836669922, + "distillation_loss": 5.886437892913818, + "epoch": 3.81, + "learning_rate": 3.4380576688268997e-05, + "loss": 104.1922, + "step": 4509, + "task_loss": 3.457476854324341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9377141933752599, + "compression/movement_sparsity/importance_threshold": -0.000436233272624007, + "compression/movement_sparsity/linear_layer_sparsity": 0.8872866861803285, + "compression/movement_sparsity/model_sparsity": 0.8568056588969151, + "compression_loss": 99.74446868896484, + "distillation_loss": 6.532523155212402, + "epoch": 3.81, + "learning_rate": 3.437588052972668e-05, + "loss": 103.7783, + "step": 4510, + "task_loss": 3.3756752014160156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9378468103781411, + "compression/movement_sparsity/importance_threshold": -0.00043530445830325826, + "compression/movement_sparsity/linear_layer_sparsity": 0.8872824054041473, + "compression/movement_sparsity/model_sparsity": 0.8568015251785648, + "compression_loss": 99.75796508789062, + "distillation_loss": 4.524733543395996, + "epoch": 3.81, + "learning_rate": 3.4371184371184376e-05, + "loss": 104.1183, + "step": 4511, + "task_loss": 2.351926803588867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9379792390046297, + "compression/movement_sparsity/importance_threshold": -0.0004343769633208096, + "compression/movement_sparsity/linear_layer_sparsity": 0.8873526029790195, + "compression/movement_sparsity/model_sparsity": 0.8568693112507872, + "compression_loss": 99.77149963378906, + "distillation_loss": 3.206489086151123, + "epoch": 3.81, + "learning_rate": 3.436648821264206e-05, + "loss": 103.5867, + "step": 4512, + "task_loss": 2.998471260070801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9381114793886104, + "compression/movement_sparsity/importance_threshold": -0.00043345078673896753, + "compression/movement_sparsity/linear_layer_sparsity": 0.8874244222406904, + "compression/movement_sparsity/model_sparsity": 0.8569386632998778, + "compression_loss": 99.78498077392578, + "distillation_loss": 4.093575954437256, + "epoch": 3.81, + "learning_rate": 3.436179205409975e-05, + "loss": 103.7978, + "step": 4513, + "task_loss": 1.60451340675354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9382435316639686, + "compression/movement_sparsity/importance_threshold": -0.00043252592762003505, + "compression/movement_sparsity/linear_layer_sparsity": 0.8875451663621711, + "compression/movement_sparsity/model_sparsity": 0.8570552594893334, + "compression_loss": 99.79851531982422, + "distillation_loss": 4.923433303833008, + "epoch": 3.82, + "learning_rate": 3.4357095895557435e-05, + "loss": 104.6617, + "step": 4514, + "task_loss": 2.323650598526001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9383753959645893, + "compression/movement_sparsity/importance_threshold": -0.00043160238502631693, + "compression/movement_sparsity/linear_layer_sparsity": 0.88754199453358, + "compression/movement_sparsity/model_sparsity": 0.857052196622812, + "compression_loss": 99.81199645996094, + "distillation_loss": 3.5554633140563965, + "epoch": 3.82, + "learning_rate": 3.435239973701512e-05, + "loss": 103.4666, + "step": 4515, + "task_loss": 2.3709278106689453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9385070724243578, + "compression/movement_sparsity/importance_threshold": -0.00043068015802011535, + "compression/movement_sparsity/linear_layer_sparsity": 0.8875698493891774, + "compression/movement_sparsity/model_sparsity": 0.857079094578428, + "compression_loss": 99.825439453125, + "distillation_loss": 3.8403117656707764, + "epoch": 3.82, + "learning_rate": 3.4347703578472815e-05, + "loss": 103.2009, + "step": 4516, + "task_loss": 1.60381281375885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9386385611771592, + "compression/movement_sparsity/importance_threshold": -0.0004297592456637359, + "compression/movement_sparsity/linear_layer_sparsity": 0.8876320339233984, + "compression/movement_sparsity/model_sparsity": 0.8571391428825965, + "compression_loss": 99.83885192871094, + "distillation_loss": 4.844111442565918, + "epoch": 3.82, + "learning_rate": 3.43430074199305e-05, + "loss": 104.2777, + "step": 4517, + "task_loss": 2.194101095199585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9387698623568784, + "compression/movement_sparsity/importance_threshold": -0.00042883964701948257, + "compression/movement_sparsity/linear_layer_sparsity": 0.8876756048319399, + "compression/movement_sparsity/model_sparsity": 0.8571812169963898, + "compression_loss": 99.85223388671875, + "distillation_loss": 5.410505294799805, + "epoch": 3.82, + "learning_rate": 3.433831126138819e-05, + "loss": 104.8502, + "step": 4518, + "task_loss": 2.6061019897460938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9389009760974009, + "compression/movement_sparsity/importance_threshold": -0.00042792136114965917, + "compression/movement_sparsity/linear_layer_sparsity": 0.8877567249443666, + "compression/movement_sparsity/model_sparsity": 0.8572595503834001, + "compression_loss": 99.8655776977539, + "distillation_loss": 2.262774705886841, + "epoch": 3.82, + "learning_rate": 3.4333615102845874e-05, + "loss": 103.7813, + "step": 4519, + "task_loss": 1.1959831714630127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9390319025326116, + "compression/movement_sparsity/importance_threshold": -0.00042700438711656876, + "compression/movement_sparsity/linear_layer_sparsity": 0.8878679655042415, + "compression/movement_sparsity/model_sparsity": 0.8573669694878273, + "compression_loss": 99.87895965576172, + "distillation_loss": 5.981935501098633, + "epoch": 3.82, + "learning_rate": 3.432891894430356e-05, + "loss": 104.8295, + "step": 4520, + "task_loss": 3.0067951679229736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9391626417963957, + "compression/movement_sparsity/importance_threshold": -0.0004260887239825161, + "compression/movement_sparsity/linear_layer_sparsity": 0.8879079949349952, + "compression/movement_sparsity/model_sparsity": 0.8574056237844896, + "compression_loss": 99.89231872558594, + "distillation_loss": 4.2976179122924805, + "epoch": 3.82, + "learning_rate": 3.432422278576125e-05, + "loss": 104.2309, + "step": 4521, + "task_loss": 2.67098069190979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9392931940226384, + "compression/movement_sparsity/importance_threshold": -0.0004251743708098051, + "compression/movement_sparsity/linear_layer_sparsity": 0.8879076014374632, + "compression/movement_sparsity/model_sparsity": 0.8574052438048084, + "compression_loss": 99.9056396484375, + "distillation_loss": 5.427758693695068, + "epoch": 3.82, + "learning_rate": 3.431952662721893e-05, + "loss": 104.011, + "step": 4522, + "task_loss": 2.4527268409729004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9394235593452248, + "compression/movement_sparsity/importance_threshold": -0.0004242613266607397, + "compression/movement_sparsity/linear_layer_sparsity": 0.8879777513156649, + "compression/movement_sparsity/model_sparsity": 0.8574729838188877, + "compression_loss": 99.9189453125, + "distillation_loss": 5.155549049377441, + "epoch": 3.82, + "learning_rate": 3.4314830468676626e-05, + "loss": 104.9395, + "step": 4523, + "task_loss": 2.3619422912597656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.93955373789804, + "compression/movement_sparsity/importance_threshold": -0.0004233495905976237, + "compression/movement_sparsity/linear_layer_sparsity": 0.8880392561723307, + "compression/movement_sparsity/model_sparsity": 0.8575323757945159, + "compression_loss": 99.93220520019531, + "distillation_loss": 4.723738670349121, + "epoch": 3.82, + "learning_rate": 3.431013431013431e-05, + "loss": 104.3734, + "step": 4524, + "task_loss": 3.218928575515747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9396837298149692, + "compression/movement_sparsity/importance_threshold": -0.00042243916168276194, + "compression/movement_sparsity/linear_layer_sparsity": 0.8880883956671581, + "compression/movement_sparsity/model_sparsity": 0.8575798271965253, + "compression_loss": 99.94552612304688, + "distillation_loss": 4.045338153839111, + "epoch": 3.82, + "learning_rate": 3.4305438151592e-05, + "loss": 104.1328, + "step": 4525, + "task_loss": 2.146289825439453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9398135352298975, + "compression/movement_sparsity/importance_threshold": -0.00042153003897845657, + "compression/movement_sparsity/linear_layer_sparsity": 0.8881129475283204, + "compression/movement_sparsity/model_sparsity": 0.8576035356257262, + "compression_loss": 99.958740234375, + "distillation_loss": 4.084800720214844, + "epoch": 3.83, + "learning_rate": 3.4300741993049685e-05, + "loss": 104.1845, + "step": 4526, + "task_loss": 2.2716400623321533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9399431542767102, + "compression/movement_sparsity/importance_threshold": -0.00042062222154701234, + "compression/movement_sparsity/linear_layer_sparsity": 0.8881531796699238, + "compression/movement_sparsity/model_sparsity": 0.857642385669497, + "compression_loss": 99.97191619873047, + "distillation_loss": 4.517703056335449, + "epoch": 3.83, + "learning_rate": 3.429604583450737e-05, + "loss": 103.9249, + "step": 4527, + "task_loss": 2.561556339263916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9400725870892921, + "compression/movement_sparsity/importance_threshold": -0.0004197157084507349, + "compression/movement_sparsity/linear_layer_sparsity": 0.8881672740360693, + "compression/movement_sparsity/model_sparsity": 0.8576559958508061, + "compression_loss": 99.98516845703125, + "distillation_loss": 2.7468767166137695, + "epoch": 3.83, + "learning_rate": 3.4291349675965064e-05, + "loss": 103.6224, + "step": 4528, + "task_loss": 2.8449552059173584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9402018338015287, + "compression/movement_sparsity/importance_threshold": -0.0004188104987519264, + "compression/movement_sparsity/linear_layer_sparsity": 0.8882513394179022, + "compression/movement_sparsity/model_sparsity": 0.8577371733281576, + "compression_loss": 99.9983139038086, + "distillation_loss": 5.504179000854492, + "epoch": 3.83, + "learning_rate": 3.428665351742275e-05, + "loss": 104.7667, + "step": 4529, + "task_loss": 2.6728708744049072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9403308945473048, + "compression/movement_sparsity/importance_threshold": -0.0004179065915128916, + "compression/movement_sparsity/linear_layer_sparsity": 0.8882881254750589, + "compression/movement_sparsity/model_sparsity": 0.8577726956710838, + "compression_loss": 100.011474609375, + "distillation_loss": 3.343381404876709, + "epoch": 3.83, + "learning_rate": 3.428195735888044e-05, + "loss": 103.4983, + "step": 4530, + "task_loss": 2.350043773651123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9404597694605058, + "compression/movement_sparsity/importance_threshold": -0.0004170039857959336, + "compression/movement_sparsity/linear_layer_sparsity": 0.888313500103788, + "compression/movement_sparsity/model_sparsity": 0.8577971986032545, + "compression_loss": 100.02455139160156, + "distillation_loss": 5.592968940734863, + "epoch": 3.83, + "learning_rate": 3.427726120033812e-05, + "loss": 104.6841, + "step": 4531, + "task_loss": 3.4871695041656494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9405884586750167, + "compression/movement_sparsity/importance_threshold": -0.00041610268066335793, + "compression/movement_sparsity/linear_layer_sparsity": 0.8883812889967978, + "compression/movement_sparsity/model_sparsity": 0.8578626587392465, + "compression_loss": 100.03768157958984, + "distillation_loss": 4.361526012420654, + "epoch": 3.83, + "learning_rate": 3.427256504179581e-05, + "loss": 104.7224, + "step": 4532, + "task_loss": 1.7217413187026978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9407169623247228, + "compression/movement_sparsity/importance_threshold": -0.0004152026751774668, + "compression/movement_sparsity/linear_layer_sparsity": 0.8884393119965139, + "compression/movement_sparsity/model_sparsity": 0.8579186884704227, + "compression_loss": 100.05084228515625, + "distillation_loss": 4.935878276824951, + "epoch": 3.83, + "learning_rate": 3.42678688832535e-05, + "loss": 104.6758, + "step": 4533, + "task_loss": 1.5802088975906372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9408452805435091, + "compression/movement_sparsity/importance_threshold": -0.000414303968400565, + "compression/movement_sparsity/linear_layer_sparsity": 0.8885264895860997, + "compression/movement_sparsity/model_sparsity": 0.8580028712416167, + "compression_loss": 100.0638656616211, + "distillation_loss": 3.853682041168213, + "epoch": 3.83, + "learning_rate": 3.426317272471119e-05, + "loss": 104.6535, + "step": 4534, + "task_loss": 2.6037731170654297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9409734134652609, + "compression/movement_sparsity/importance_threshold": -0.0004134065593949564, + "compression/movement_sparsity/linear_layer_sparsity": 0.8886348921940773, + "compression/movement_sparsity/model_sparsity": 0.8581075498865248, + "compression_loss": 100.07693481445312, + "distillation_loss": 5.714635848999023, + "epoch": 3.83, + "learning_rate": 3.4258476566168875e-05, + "loss": 104.8832, + "step": 4535, + "task_loss": 3.432044744491577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9411013612238631, + "compression/movement_sparsity/importance_threshold": -0.0004125104472229449, + "compression/movement_sparsity/linear_layer_sparsity": 0.8887242757546758, + "compression/movement_sparsity/model_sparsity": 0.8581938628468407, + "compression_loss": 100.08992767333984, + "distillation_loss": 5.693085670471191, + "epoch": 3.83, + "learning_rate": 3.425378040762656e-05, + "loss": 103.8307, + "step": 4536, + "task_loss": 2.228610038757324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.941229123953201, + "compression/movement_sparsity/importance_threshold": -0.0004116156309468353, + "compression/movement_sparsity/linear_layer_sparsity": 0.8887757881588627, + "compression/movement_sparsity/model_sparsity": 0.8582436056414732, + "compression_loss": 100.10295867919922, + "distillation_loss": 5.111946105957031, + "epoch": 3.83, + "learning_rate": 3.424908424908425e-05, + "loss": 105.0145, + "step": 4537, + "task_loss": 1.9150798320770264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9413567017871597, + "compression/movement_sparsity/importance_threshold": -0.0004107221096289306, + "compression/movement_sparsity/linear_layer_sparsity": 0.8888842980843491, + "compression/movement_sparsity/model_sparsity": 0.8583483879172035, + "compression_loss": 100.1159439086914, + "distillation_loss": 4.226686954498291, + "epoch": 3.84, + "learning_rate": 3.424438809054194e-05, + "loss": 104.7049, + "step": 4538, + "task_loss": 2.518601894378662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9414840948596243, + "compression/movement_sparsity/importance_threshold": -0.0004098298823315364, + "compression/movement_sparsity/linear_layer_sparsity": 0.8889090884288641, + "compression/movement_sparsity/model_sparsity": 0.8583723266371203, + "compression_loss": 100.12889099121094, + "distillation_loss": 4.587031841278076, + "epoch": 3.84, + "learning_rate": 3.423969193199962e-05, + "loss": 104.2651, + "step": 4539, + "task_loss": 2.4530508518218994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9416113033044801, + "compression/movement_sparsity/importance_threshold": -0.0004089389481169532, + "compression/movement_sparsity/linear_layer_sparsity": 0.8890357946341627, + "compression/movement_sparsity/model_sparsity": 0.858494680094473, + "compression_loss": 100.14183044433594, + "distillation_loss": 4.548207759857178, + "epoch": 3.84, + "learning_rate": 3.4234995773457314e-05, + "loss": 104.1211, + "step": 4540, + "task_loss": 2.712223529815674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.941738327255612, + "compression/movement_sparsity/importance_threshold": -0.0004080493060474883, + "compression/movement_sparsity/linear_layer_sparsity": 0.889160485655131, + "compression/movement_sparsity/model_sparsity": 0.8586150875952766, + "compression_loss": 100.15477752685547, + "distillation_loss": 4.381241798400879, + "epoch": 3.84, + "learning_rate": 3.4230299614915e-05, + "loss": 104.1161, + "step": 4541, + "task_loss": 2.9729394912719727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9418651668469054, + "compression/movement_sparsity/importance_threshold": -0.0004071609551854448, + "compression/movement_sparsity/linear_layer_sparsity": 0.8892655614203382, + "compression/movement_sparsity/model_sparsity": 0.8587165536846981, + "compression_loss": 100.16768646240234, + "distillation_loss": 4.094034194946289, + "epoch": 3.84, + "learning_rate": 3.422560345637269e-05, + "loss": 104.069, + "step": 4542, + "task_loss": 1.6684879064559937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9419918222122452, + "compression/movement_sparsity/importance_threshold": -0.00040627389459312654, + "compression/movement_sparsity/linear_layer_sparsity": 0.8893226901074816, + "compression/movement_sparsity/model_sparsity": 0.8587717198256898, + "compression_loss": 100.18064880371094, + "distillation_loss": 5.238960266113281, + "epoch": 3.84, + "learning_rate": 3.422090729783038e-05, + "loss": 104.8357, + "step": 4543, + "task_loss": 3.900768756866455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9421182934855167, + "compression/movement_sparsity/importance_threshold": -0.0004053881233328366, + "compression/movement_sparsity/linear_layer_sparsity": 0.8893198879280871, + "compression/movement_sparsity/model_sparsity": 0.858769013909778, + "compression_loss": 100.19351959228516, + "distillation_loss": 5.322239875793457, + "epoch": 3.84, + "learning_rate": 3.421621113928806e-05, + "loss": 105.3242, + "step": 4544, + "task_loss": 3.2929630279541016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9422445808006049, + "compression/movement_sparsity/importance_threshold": -0.00040450364046688055, + "compression/movement_sparsity/linear_layer_sparsity": 0.8893775174302713, + "compression/movement_sparsity/model_sparsity": 0.8588246636612731, + "compression_loss": 100.20641326904297, + "distillation_loss": 4.867955207824707, + "epoch": 3.84, + "learning_rate": 3.421151498074575e-05, + "loss": 105.1653, + "step": 4545, + "task_loss": 1.9060171842575073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9423706842913951, + "compression/movement_sparsity/importance_threshold": -0.0004036204450575606, + "compression/movement_sparsity/linear_layer_sparsity": 0.8894493247677745, + "compression/movement_sparsity/model_sparsity": 0.8588940041958277, + "compression_loss": 100.21927642822266, + "distillation_loss": 4.625563621520996, + "epoch": 3.84, + "learning_rate": 3.420681882220344e-05, + "loss": 105.438, + "step": 4546, + "task_loss": 3.3087544441223145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9424966040917724, + "compression/movement_sparsity/importance_threshold": -0.0004027385361671833, + "compression/movement_sparsity/linear_layer_sparsity": 0.8895555452530747, + "compression/movement_sparsity/model_sparsity": 0.8589965756806855, + "compression_loss": 100.2320785522461, + "distillation_loss": 7.1838884353637695, + "epoch": 3.84, + "learning_rate": 3.420212266366113e-05, + "loss": 105.4651, + "step": 4547, + "task_loss": 3.5911319255828857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9426223403356219, + "compression/movement_sparsity/importance_threshold": -0.0004018579128580498, + "compression/movement_sparsity/linear_layer_sparsity": 0.889582648886111, + "compression/movement_sparsity/model_sparsity": 0.8590227482205465, + "compression_loss": 100.24492645263672, + "distillation_loss": 5.1949615478515625, + "epoch": 3.84, + "learning_rate": 3.419742650511881e-05, + "loss": 104.4174, + "step": 4548, + "task_loss": 3.3867945671081543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9427478931568287, + "compression/movement_sparsity/importance_threshold": -0.00040097857419246503, + "compression/movement_sparsity/linear_layer_sparsity": 0.8897170819520378, + "compression/movement_sparsity/model_sparsity": 0.8591525630970942, + "compression_loss": 100.25775146484375, + "distillation_loss": 3.4227688312530518, + "epoch": 3.84, + "learning_rate": 3.4192730346576504e-05, + "loss": 104.3519, + "step": 4549, + "task_loss": 2.013906717300415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9428732626892781, + "compression/movement_sparsity/importance_threshold": -0.0004001005192327345, + "compression/movement_sparsity/linear_layer_sparsity": 0.88976488594009, + "compression/movement_sparsity/model_sparsity": 0.8591987248710945, + "compression_loss": 100.27051544189453, + "distillation_loss": 6.102532386779785, + "epoch": 3.85, + "learning_rate": 3.418803418803419e-05, + "loss": 104.9936, + "step": 4550, + "task_loss": 2.461862564086914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9429984490668549, + "compression/movement_sparsity/importance_threshold": -0.00039922374704116047, + "compression/movement_sparsity/linear_layer_sparsity": 0.8898026974756632, + "compression/movement_sparsity/model_sparsity": 0.859235237464099, + "compression_loss": 100.28326416015625, + "distillation_loss": 3.8065192699432373, + "epoch": 3.85, + "learning_rate": 3.418333802949188e-05, + "loss": 104.7014, + "step": 4551, + "task_loss": 2.004796266555786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9431234524234448, + "compression/movement_sparsity/importance_threshold": -0.00039834825668004677, + "compression/movement_sparsity/linear_layer_sparsity": 0.8899704705142999, + "compression/movement_sparsity/model_sparsity": 0.8593972469827281, + "compression_loss": 100.29605865478516, + "distillation_loss": 3.8205912113189697, + "epoch": 3.85, + "learning_rate": 3.4178641870949563e-05, + "loss": 104.9145, + "step": 4552, + "task_loss": 2.087481737136841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9432482728929324, + "compression/movement_sparsity/importance_threshold": -0.0003974740472116982, + "compression/movement_sparsity/linear_layer_sparsity": 0.8899590113892017, + "compression/movement_sparsity/model_sparsity": 0.8593861815138296, + "compression_loss": 100.3088150024414, + "distillation_loss": 3.8403329849243164, + "epoch": 3.85, + "learning_rate": 3.417394571240725e-05, + "loss": 104.9877, + "step": 4553, + "task_loss": 1.9831923246383667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.943372910609203, + "compression/movement_sparsity/importance_threshold": -0.0003966011176984195, + "compression/movement_sparsity/linear_layer_sparsity": 0.8899558753331136, + "compression/movement_sparsity/model_sparsity": 0.8593831531909156, + "compression_loss": 100.32152557373047, + "distillation_loss": 5.918248653411865, + "epoch": 3.85, + "learning_rate": 3.416924955386494e-05, + "loss": 104.7306, + "step": 4554, + "task_loss": 2.389813184738159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.943497365706142, + "compression/movement_sparsity/importance_threshold": -0.00039572946720251283, + "compression/movement_sparsity/linear_layer_sparsity": 0.8900250355054016, + "compression/movement_sparsity/model_sparsity": 0.8594499374985239, + "compression_loss": 100.3342056274414, + "distillation_loss": 4.823528289794922, + "epoch": 3.85, + "learning_rate": 3.416455339532263e-05, + "loss": 104.8423, + "step": 4555, + "task_loss": 3.9171040058135986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9436216383176342, + "compression/movement_sparsity/importance_threshold": -0.000394859094786283, + "compression/movement_sparsity/linear_layer_sparsity": 0.8901544604209212, + "compression/movement_sparsity/model_sparsity": 0.8595749162700379, + "compression_loss": 100.34680938720703, + "distillation_loss": 4.388155460357666, + "epoch": 3.85, + "learning_rate": 3.4159857236780316e-05, + "loss": 104.1821, + "step": 4556, + "task_loss": 2.3760643005371094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9437457285775649, + "compression/movement_sparsity/importance_threshold": -0.0003939899995120339, + "compression/movement_sparsity/linear_layer_sparsity": 0.8902404217454082, + "compression/movement_sparsity/model_sparsity": 0.8596579245585808, + "compression_loss": 100.35943603515625, + "distillation_loss": 5.23970890045166, + "epoch": 3.85, + "learning_rate": 3.4155161078238e-05, + "loss": 104.6851, + "step": 4557, + "task_loss": 2.7074804306030273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9438696366198193, + "compression/movement_sparsity/importance_threshold": -0.00039312218044207023, + "compression/movement_sparsity/linear_layer_sparsity": 0.8902626841663843, + "compression/movement_sparsity/model_sparsity": 0.8596794221969092, + "compression_loss": 100.3719711303711, + "distillation_loss": 3.5079846382141113, + "epoch": 3.85, + "learning_rate": 3.415046491969569e-05, + "loss": 105.0793, + "step": 4558, + "task_loss": 2.076284885406494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9439933625782824, + "compression/movement_sparsity/importance_threshold": -0.00039225563663869423, + "compression/movement_sparsity/linear_layer_sparsity": 0.8903582205974828, + "compression/movement_sparsity/model_sparsity": 0.859771676657695, + "compression_loss": 100.38452911376953, + "distillation_loss": 5.416142463684082, + "epoch": 3.85, + "learning_rate": 3.414576876115338e-05, + "loss": 105.6541, + "step": 4559, + "task_loss": 2.8598122596740723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9441169065868393, + "compression/movement_sparsity/importance_threshold": -0.00039139036716421237, + "compression/movement_sparsity/linear_layer_sparsity": 0.8903723507361313, + "compression/movement_sparsity/model_sparsity": 0.8597853213826115, + "compression_loss": 100.39707946777344, + "distillation_loss": 5.128992080688477, + "epoch": 3.85, + "learning_rate": 3.414107260261107e-05, + "loss": 105.5694, + "step": 4560, + "task_loss": 2.923676013946533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9442402687793753, + "compression/movement_sparsity/importance_threshold": -0.0003905263710809277, + "compression/movement_sparsity/linear_layer_sparsity": 0.8904547348103276, + "compression/movement_sparsity/model_sparsity": 0.859864875310416, + "compression_loss": 100.4096450805664, + "distillation_loss": 2.975842237472534, + "epoch": 3.85, + "learning_rate": 3.4136376444068754e-05, + "loss": 103.6428, + "step": 4561, + "task_loss": 1.151943325996399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9443634492897756, + "compression/movement_sparsity/importance_threshold": -0.0003896636474511415, + "compression/movement_sparsity/linear_layer_sparsity": 0.8905473021236847, + "compression/movement_sparsity/model_sparsity": 0.8599542626517891, + "compression_loss": 100.422119140625, + "distillation_loss": 5.097267150878906, + "epoch": 3.86, + "learning_rate": 3.413168028552644e-05, + "loss": 104.6303, + "step": 4562, + "task_loss": 3.3521292209625244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9444864482519251, + "compression/movement_sparsity/importance_threshold": -0.00038880219533716197, + "compression/movement_sparsity/linear_layer_sparsity": 0.8905874150236117, + "compression/movement_sparsity/model_sparsity": 0.8599929975502019, + "compression_loss": 100.4345703125, + "distillation_loss": 4.044228553771973, + "epoch": 3.86, + "learning_rate": 3.412698412698413e-05, + "loss": 103.9783, + "step": 4563, + "task_loss": 1.4565430879592896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.944609265799709, + "compression/movement_sparsity/importance_threshold": -0.00038794201380129047, + "compression/movement_sparsity/linear_layer_sparsity": 0.8907103651161051, + "compression/movement_sparsity/model_sparsity": 0.8601117239287794, + "compression_loss": 100.44706726074219, + "distillation_loss": 5.892083644866943, + "epoch": 3.86, + "learning_rate": 3.412228796844182e-05, + "loss": 105.0898, + "step": 4564, + "task_loss": 3.5995705127716064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9447319020670126, + "compression/movement_sparsity/importance_threshold": -0.00038708310190583173, + "compression/movement_sparsity/linear_layer_sparsity": 0.8907812185201974, + "compression/movement_sparsity/model_sparsity": 0.8601801433004705, + "compression_loss": 100.45948028564453, + "distillation_loss": 4.049729347229004, + "epoch": 3.86, + "learning_rate": 3.41175918098995e-05, + "loss": 105.1806, + "step": 4565, + "task_loss": 0.9365973472595215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9448543571877209, + "compression/movement_sparsity/importance_threshold": -0.00038622545871308967, + "compression/movement_sparsity/linear_layer_sparsity": 0.890756249313168, + "compression/movement_sparsity/model_sparsity": 0.8601560318625168, + "compression_loss": 100.4719009399414, + "distillation_loss": 5.677170276641846, + "epoch": 3.86, + "learning_rate": 3.411289565135719e-05, + "loss": 104.8938, + "step": 4566, + "task_loss": 3.0070064067840576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9449766312957192, + "compression/movement_sparsity/importance_threshold": -0.0003853690832853682, + "compression/movement_sparsity/linear_layer_sparsity": 0.8908949512311083, + "compression/movement_sparsity/model_sparsity": 0.8602899689428789, + "compression_loss": 100.48423767089844, + "distillation_loss": 4.856156349182129, + "epoch": 3.86, + "learning_rate": 3.410819949281488e-05, + "loss": 105.0563, + "step": 4567, + "task_loss": 3.40464448928833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9450987245248924, + "compression/movement_sparsity/importance_threshold": -0.00038451397468497114, + "compression/movement_sparsity/linear_layer_sparsity": 0.8909962589593426, + "compression/movement_sparsity/model_sparsity": 0.8603877964389893, + "compression_loss": 100.49665069580078, + "distillation_loss": 4.184131622314453, + "epoch": 3.86, + "learning_rate": 3.4103503334272565e-05, + "loss": 104.89, + "step": 4568, + "task_loss": 2.2326037883758545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9452206370091258, + "compression/movement_sparsity/importance_threshold": -0.0003836601319742025, + "compression/movement_sparsity/linear_layer_sparsity": 0.8910834365489284, + "compression/movement_sparsity/model_sparsity": 0.8604719792101833, + "compression_loss": 100.5090103149414, + "distillation_loss": 3.6449947357177734, + "epoch": 3.86, + "learning_rate": 3.409880717573025e-05, + "loss": 104.3568, + "step": 4569, + "task_loss": 0.9449997544288635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9453423688823045, + "compression/movement_sparsity/importance_threshold": -0.00038280755421536694, + "compression/movement_sparsity/linear_layer_sparsity": 0.8911646043580257, + "compression/movement_sparsity/model_sparsity": 0.8605503586553367, + "compression_loss": 100.52131652832031, + "distillation_loss": 4.239992141723633, + "epoch": 3.86, + "learning_rate": 3.409411101718794e-05, + "loss": 104.8974, + "step": 4570, + "task_loss": 2.364339590072632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9454639202783137, + "compression/movement_sparsity/importance_threshold": -0.00038195624047076757, + "compression/movement_sparsity/linear_layer_sparsity": 0.8912603673483094, + "compression/movement_sparsity/model_sparsity": 0.8606428318923026, + "compression_loss": 100.53364562988281, + "distillation_loss": 5.223026752471924, + "epoch": 3.86, + "learning_rate": 3.408941485864563e-05, + "loss": 105.3514, + "step": 4571, + "task_loss": 3.9417033195495605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9455852913310384, + "compression/movement_sparsity/importance_threshold": -0.0003811061898027091, + "compression/movement_sparsity/linear_layer_sparsity": 0.8914089663253875, + "compression/movement_sparsity/model_sparsity": 0.8607863260373741, + "compression_loss": 100.54591369628906, + "distillation_loss": 5.254037380218506, + "epoch": 3.86, + "learning_rate": 3.408471870010332e-05, + "loss": 105.6839, + "step": 4572, + "task_loss": 2.153132677078247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9457064821743638, + "compression/movement_sparsity/importance_threshold": -0.0003802574012734955, + "compression/movement_sparsity/linear_layer_sparsity": 0.8913756740493481, + "compression/movement_sparsity/model_sparsity": 0.8607541774534359, + "compression_loss": 100.55821228027344, + "distillation_loss": 4.223822593688965, + "epoch": 3.87, + "learning_rate": 3.408002254156101e-05, + "loss": 105.297, + "step": 4573, + "task_loss": 2.9718286991119385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9458274929421752, + "compression/movement_sparsity/importance_threshold": -0.00037940987394542974, + "compression/movement_sparsity/linear_layer_sparsity": 0.8914539442857099, + "compression/movement_sparsity/model_sparsity": 0.8608297588663912, + "compression_loss": 100.5704574584961, + "distillation_loss": 3.7230608463287354, + "epoch": 3.87, + "learning_rate": 3.407532638301869e-05, + "loss": 105.0828, + "step": 4574, + "task_loss": 1.6224454641342163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9459483237683575, + "compression/movement_sparsity/importance_threshold": -0.0003785636068808166, + "compression/movement_sparsity/linear_layer_sparsity": 0.8914448938424743, + "compression/movement_sparsity/model_sparsity": 0.8608210193337231, + "compression_loss": 100.582763671875, + "distillation_loss": 4.792668342590332, + "epoch": 3.87, + "learning_rate": 3.407063022447638e-05, + "loss": 104.9859, + "step": 4575, + "task_loss": 2.8506128787994385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.946068974786796, + "compression/movement_sparsity/importance_threshold": -0.00037771859914196, + "compression/movement_sparsity/linear_layer_sparsity": 0.8914831823447531, + "compression/movement_sparsity/model_sparsity": 0.8608579925081594, + "compression_loss": 100.59502410888672, + "distillation_loss": 4.605414390563965, + "epoch": 3.87, + "learning_rate": 3.406593406593407e-05, + "loss": 105.2027, + "step": 4576, + "task_loss": 2.140082836151123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9461894461313758, + "compression/movement_sparsity/importance_threshold": -0.0003768748497911629, + "compression/movement_sparsity/linear_layer_sparsity": 0.8915669376982274, + "compression/movement_sparsity/model_sparsity": 0.8609388706075802, + "compression_loss": 100.60725402832031, + "distillation_loss": 4.68811559677124, + "epoch": 3.87, + "learning_rate": 3.4061237907391756e-05, + "loss": 104.7396, + "step": 4577, + "task_loss": 3.1756834983825684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.946309737935982, + "compression/movement_sparsity/importance_threshold": -0.00037603235789073104, + "compression/movement_sparsity/linear_layer_sparsity": 0.891624483731238, + "compression/movement_sparsity/model_sparsity": 0.8609944397573247, + "compression_loss": 100.61952209472656, + "distillation_loss": 4.796651840209961, + "epoch": 3.87, + "learning_rate": 3.405654174884944e-05, + "loss": 105.1258, + "step": 4578, + "task_loss": 2.046924352645874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9464298503344998, + "compression/movement_sparsity/importance_threshold": -0.0003751911225029674, + "compression/movement_sparsity/linear_layer_sparsity": 0.8915847404805077, + "compression/movement_sparsity/model_sparsity": 0.8609560618095216, + "compression_loss": 100.63166046142578, + "distillation_loss": 3.3481109142303467, + "epoch": 3.87, + "learning_rate": 3.405184559030713e-05, + "loss": 105.0531, + "step": 4579, + "task_loss": 1.4707443714141846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9465497834608143, + "compression/movement_sparsity/importance_threshold": -0.00037435114269017584, + "compression/movement_sparsity/linear_layer_sparsity": 0.8916884330422692, + "compression/movement_sparsity/model_sparsity": 0.8610561922127908, + "compression_loss": 100.643798828125, + "distillation_loss": 4.046797275543213, + "epoch": 3.87, + "learning_rate": 3.404714943176482e-05, + "loss": 105.0061, + "step": 4580, + "task_loss": 1.945582389831543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9466695374488105, + "compression/movement_sparsity/importance_threshold": -0.00037351241751466206, + "compression/movement_sparsity/linear_layer_sparsity": 0.8917565438878052, + "compression/movement_sparsity/model_sparsity": 0.8611219632412492, + "compression_loss": 100.65595245361328, + "distillation_loss": 6.010524272918701, + "epoch": 3.87, + "learning_rate": 3.404245327322251e-05, + "loss": 105.0814, + "step": 4581, + "task_loss": 3.0034704208374023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.946789112432374, + "compression/movement_sparsity/importance_threshold": -0.00037267494603872645, + "compression/movement_sparsity/linear_layer_sparsity": 0.8919112361145453, + "compression/movement_sparsity/model_sparsity": 0.8612713413141119, + "compression_loss": 100.66813659667969, + "distillation_loss": 4.315887451171875, + "epoch": 3.87, + "learning_rate": 3.4037757114680194e-05, + "loss": 105.2493, + "step": 4582, + "task_loss": 2.8687756061553955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9469085085453894, + "compression/movement_sparsity/importance_threshold": -0.0003718387273246764, + "compression/movement_sparsity/linear_layer_sparsity": 0.891950836275264, + "compression/movement_sparsity/model_sparsity": 0.8613095810874855, + "compression_loss": 100.68025207519531, + "distillation_loss": 5.229598522186279, + "epoch": 3.87, + "learning_rate": 3.403306095613788e-05, + "loss": 105.2484, + "step": 4583, + "task_loss": 2.2565793991088867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9470277259217422, + "compression/movement_sparsity/importance_threshold": -0.00037100376043481403, + "compression/movement_sparsity/linear_layer_sparsity": 0.8920489125540689, + "compression/movement_sparsity/model_sparsity": 0.8614042881443956, + "compression_loss": 100.69231414794922, + "distillation_loss": 3.4245283603668213, + "epoch": 3.87, + "learning_rate": 3.402836479759557e-05, + "loss": 105.0676, + "step": 4584, + "task_loss": 2.0414838790893555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9471467646953173, + "compression/movement_sparsity/importance_threshold": -0.00037017004443144503, + "compression/movement_sparsity/linear_layer_sparsity": 0.8921379980104764, + "compression/movement_sparsity/model_sparsity": 0.8614903132413166, + "compression_loss": 100.70439147949219, + "distillation_loss": 4.959332466125488, + "epoch": 3.88, + "learning_rate": 3.402366863905326e-05, + "loss": 105.5242, + "step": 4585, + "task_loss": 3.6372737884521484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.947265625, + "compression/movement_sparsity/importance_threshold": -0.00036933757837687153, + "compression/movement_sparsity/linear_layer_sparsity": 0.8922087560212277, + "compression/movement_sparsity/model_sparsity": 0.8615586404967214, + "compression_loss": 100.71643829345703, + "distillation_loss": 5.04964542388916, + "epoch": 3.88, + "learning_rate": 3.401897248051094e-05, + "loss": 104.8315, + "step": 4586, + "task_loss": 2.7517597675323486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9473843069696755, + "compression/movement_sparsity/importance_threshold": -0.00036850636133339745, + "compression/movement_sparsity/linear_layer_sparsity": 0.892278595871071, + "compression/movement_sparsity/model_sparsity": 0.8616260811328701, + "compression_loss": 100.72847747802734, + "distillation_loss": 3.693368911743164, + "epoch": 3.88, + "learning_rate": 3.401427632196863e-05, + "loss": 105.0103, + "step": 4587, + "task_loss": 1.712372899055481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9475028107382286, + "compression/movement_sparsity/importance_threshold": -0.00036767639236332927, + "compression/movement_sparsity/linear_layer_sparsity": 0.8922988908043872, + "compression/movement_sparsity/model_sparsity": 0.8616456788727923, + "compression_loss": 100.74050903320312, + "distillation_loss": 5.214860916137695, + "epoch": 3.88, + "learning_rate": 3.400958016342632e-05, + "loss": 105.4842, + "step": 4588, + "task_loss": 2.8640553951263428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9476211364395448, + "compression/movement_sparsity/importance_threshold": -0.0003668476705289683, + "compression/movement_sparsity/linear_layer_sparsity": 0.8923699588434969, + "compression/movement_sparsity/model_sparsity": 0.8617143055061278, + "compression_loss": 100.75250244140625, + "distillation_loss": 5.294858932495117, + "epoch": 3.88, + "learning_rate": 3.4004884004884005e-05, + "loss": 106.757, + "step": 4589, + "task_loss": 3.3166391849517822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9477392842075092, + "compression/movement_sparsity/importance_threshold": -0.0003660201948926193, + "compression/movement_sparsity/linear_layer_sparsity": 0.8924523190693578, + "compression/movement_sparsity/model_sparsity": 0.8617938364048607, + "compression_loss": 100.76454162597656, + "distillation_loss": 4.4140119552612305, + "epoch": 3.88, + "learning_rate": 3.40001878463417e-05, + "loss": 105.0569, + "step": 4590, + "task_loss": 2.84916615486145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9478572541760069, + "compression/movement_sparsity/importance_threshold": -0.00036519396451658616, + "compression/movement_sparsity/linear_layer_sparsity": 0.8924921457892616, + "compression/movement_sparsity/model_sparsity": 0.8618322949544145, + "compression_loss": 100.7764892578125, + "distillation_loss": 4.313340187072754, + "epoch": 3.88, + "learning_rate": 3.399549168779938e-05, + "loss": 105.0468, + "step": 4591, + "task_loss": 3.1176624298095703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9479750464789227, + "compression/movement_sparsity/importance_threshold": -0.00036436897846317453, + "compression/movement_sparsity/linear_layer_sparsity": 0.8925161968353832, + "compression/movement_sparsity/model_sparsity": 0.861855519773112, + "compression_loss": 100.78843688964844, + "distillation_loss": 6.2348222732543945, + "epoch": 3.88, + "learning_rate": 3.399079552925707e-05, + "loss": 105.7034, + "step": 4592, + "task_loss": 3.5231194496154785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9480926612501424, + "compression/movement_sparsity/importance_threshold": -0.0003635452357946857, + "compression/movement_sparsity/linear_layer_sparsity": 0.8926229181357241, + "compression/movement_sparsity/model_sparsity": 0.8619585748684732, + "compression_loss": 100.8003921508789, + "distillation_loss": 4.526049613952637, + "epoch": 3.88, + "learning_rate": 3.398609937071476e-05, + "loss": 104.7618, + "step": 4593, + "task_loss": 2.387983560562134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9482100986235505, + "compression/movement_sparsity/importance_threshold": -0.0003627227355734253, + "compression/movement_sparsity/linear_layer_sparsity": 0.8926213322214286, + "compression/movement_sparsity/model_sparsity": 0.8619570434352125, + "compression_loss": 100.81232452392578, + "distillation_loss": 5.399662017822266, + "epoch": 3.88, + "learning_rate": 3.3981403212172444e-05, + "loss": 104.5675, + "step": 4594, + "task_loss": 3.5217273235321045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9483273587330325, + "compression/movement_sparsity/importance_threshold": -0.00036190147686169724, + "compression/movement_sparsity/linear_layer_sparsity": 0.8926457052200762, + "compression/movement_sparsity/model_sparsity": 0.8619805791463766, + "compression_loss": 100.82414245605469, + "distillation_loss": 3.3795714378356934, + "epoch": 3.88, + "learning_rate": 3.397670705363013e-05, + "loss": 104.9265, + "step": 4595, + "task_loss": 1.6177798509597778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9484444417124736, + "compression/movement_sparsity/importance_threshold": -0.00036108145872180454, + "compression/movement_sparsity/linear_layer_sparsity": 0.8927784092816958, + "compression/movement_sparsity/model_sparsity": 0.862108724415234, + "compression_loss": 100.8360366821289, + "distillation_loss": 4.153824329376221, + "epoch": 3.88, + "learning_rate": 3.3972010895087817e-05, + "loss": 104.7519, + "step": 4596, + "task_loss": 3.192978858947754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9485613476957586, + "compression/movement_sparsity/importance_threshold": -0.00036026268021605284, + "compression/movement_sparsity/linear_layer_sparsity": 0.8929154179678318, + "compression/movement_sparsity/model_sparsity": 0.8622410264315132, + "compression_loss": 100.84784698486328, + "distillation_loss": 4.537819862365723, + "epoch": 3.89, + "learning_rate": 3.396731473654551e-05, + "loss": 105.7919, + "step": 4597, + "task_loss": 2.135457992553711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.948678076816773, + "compression/movement_sparsity/importance_threshold": -0.0003594451404067443, + "compression/movement_sparsity/linear_layer_sparsity": 0.8929555308677589, + "compression/movement_sparsity/model_sparsity": 0.8622797613299261, + "compression_loss": 100.85968017578125, + "distillation_loss": 4.291610240936279, + "epoch": 3.89, + "learning_rate": 3.3962618578003196e-05, + "loss": 105.1975, + "step": 4598, + "task_loss": 1.4691524505615234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9487946292094017, + "compression/movement_sparsity/importance_threshold": -0.0003586288383561837, + "compression/movement_sparsity/linear_layer_sparsity": 0.8930258119118046, + "compression/movement_sparsity/model_sparsity": 0.8623476280038992, + "compression_loss": 100.87142944335938, + "distillation_loss": 4.002674579620361, + "epoch": 3.89, + "learning_rate": 3.395792241946088e-05, + "loss": 104.5335, + "step": 4599, + "task_loss": 1.9293248653411865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9489110050075299, + "compression/movement_sparsity/importance_threshold": -0.00035781377312667575, + "compression/movement_sparsity/linear_layer_sparsity": 0.8931613420011538, + "compression/movement_sparsity/model_sparsity": 0.8624785022177399, + "compression_loss": 100.88325500488281, + "distillation_loss": 5.997797966003418, + "epoch": 3.89, + "learning_rate": 3.395322626091857e-05, + "loss": 106.0315, + "step": 4600, + "task_loss": 2.6881532669067383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9490272043450427, + "compression/movement_sparsity/importance_threshold": -0.00035699994378052356, + "compression/movement_sparsity/linear_layer_sparsity": 0.8932217856069, + "compression/movement_sparsity/model_sparsity": 0.8625368693996824, + "compression_loss": 100.89498901367188, + "distillation_loss": 4.720440864562988, + "epoch": 3.89, + "learning_rate": 3.3948530102376255e-05, + "loss": 105.6037, + "step": 4601, + "task_loss": 3.422818422317505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9491432273558256, + "compression/movement_sparsity/importance_threshold": -0.0003561873493800301, + "compression/movement_sparsity/linear_layer_sparsity": 0.8932982433697813, + "compression/movement_sparsity/model_sparsity": 0.862610700603197, + "compression_loss": 100.90672302246094, + "distillation_loss": 3.991898536682129, + "epoch": 3.89, + "learning_rate": 3.394383394383395e-05, + "loss": 105.1803, + "step": 4602, + "task_loss": 2.160602569580078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9492590741737632, + "compression/movement_sparsity/importance_threshold": -0.00035537598898750106, + "compression/movement_sparsity/linear_layer_sparsity": 0.8933128981718057, + "compression/movement_sparsity/model_sparsity": 0.8626248519676885, + "compression_loss": 100.91842651367188, + "distillation_loss": 3.972752332687378, + "epoch": 3.89, + "learning_rate": 3.393913778529163e-05, + "loss": 104.5996, + "step": 4603, + "task_loss": 1.4807106256484985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9493747449327409, + "compression/movement_sparsity/importance_threshold": -0.00035456586166524116, + "compression/movement_sparsity/linear_layer_sparsity": 0.8934116302798306, + "compression/movement_sparsity/model_sparsity": 0.8627201923240673, + "compression_loss": 100.93008422851562, + "distillation_loss": 3.737037420272827, + "epoch": 3.89, + "learning_rate": 3.393444162674932e-05, + "loss": 105.2679, + "step": 4604, + "task_loss": 3.6903860569000244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9494902397666438, + "compression/movement_sparsity/importance_threshold": -0.0003537569664755517, + "compression/movement_sparsity/linear_layer_sparsity": 0.8934162330085381, + "compression/movement_sparsity/model_sparsity": 0.862724636934884, + "compression_loss": 100.9417953491211, + "distillation_loss": 4.999066352844238, + "epoch": 3.89, + "learning_rate": 3.392974546820701e-05, + "loss": 105.469, + "step": 4605, + "task_loss": 2.1755032539367676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9496055588093572, + "compression/movement_sparsity/importance_threshold": -0.00035294930248073834, + "compression/movement_sparsity/linear_layer_sparsity": 0.8934778571068802, + "compression/movement_sparsity/model_sparsity": 0.8627841440558702, + "compression_loss": 100.9533920288086, + "distillation_loss": 3.834989547729492, + "epoch": 3.89, + "learning_rate": 3.39250493096647e-05, + "loss": 105.0337, + "step": 4606, + "task_loss": 3.3596999645233154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.949720702194766, + "compression/movement_sparsity/importance_threshold": -0.00035214286874310496, + "compression/movement_sparsity/linear_layer_sparsity": 0.8935577371058728, + "compression/movement_sparsity/model_sparsity": 0.8628612799311578, + "compression_loss": 100.96499633789062, + "distillation_loss": 6.507474899291992, + "epoch": 3.89, + "learning_rate": 3.392035315112239e-05, + "loss": 106.0033, + "step": 4607, + "task_loss": 4.008217811584473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9498356700567555, + "compression/movement_sparsity/importance_threshold": -0.00035133766432495547, + "compression/movement_sparsity/linear_layer_sparsity": 0.8936075562782555, + "compression/movement_sparsity/model_sparsity": 0.8629093876617074, + "compression_loss": 100.9765853881836, + "distillation_loss": 4.785038948059082, + "epoch": 3.89, + "learning_rate": 3.3915656992580066e-05, + "loss": 104.9709, + "step": 4608, + "task_loss": 2.398371934890747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9499504625292108, + "compression/movement_sparsity/importance_threshold": -0.0003505336882885929, + "compression/movement_sparsity/linear_layer_sparsity": 0.8936079497757875, + "compression/movement_sparsity/model_sparsity": 0.8629097676413886, + "compression_loss": 100.98819732666016, + "distillation_loss": 5.563446998596191, + "epoch": 3.9, + "learning_rate": 3.391096083403776e-05, + "loss": 105.7525, + "step": 4609, + "task_loss": 2.4296858310699463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.950065079746017, + "compression/movement_sparsity/importance_threshold": -0.00034973093969632284, + "compression/movement_sparsity/linear_layer_sparsity": 0.8936861365429758, + "compression/movement_sparsity/model_sparsity": 0.8629852684525934, + "compression_loss": 100.99979400634766, + "distillation_loss": 3.281094551086426, + "epoch": 3.9, + "learning_rate": 3.3906264675495446e-05, + "loss": 105.1892, + "step": 4610, + "task_loss": 1.359919548034668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9501795218410594, + "compression/movement_sparsity/importance_threshold": -0.0003489294176104475, + "compression/movement_sparsity/linear_layer_sparsity": 0.8937487264988965, + "compression/movement_sparsity/model_sparsity": 0.8630457082509789, + "compression_loss": 101.01136779785156, + "distillation_loss": 4.986927509307861, + "epoch": 3.9, + "learning_rate": 3.390156851695314e-05, + "loss": 105.0252, + "step": 4611, + "task_loss": 3.737544536590576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9502937889482228, + "compression/movement_sparsity/importance_threshold": -0.0003481291210932734, + "compression/movement_sparsity/linear_layer_sparsity": 0.8938543626999826, + "compression/movement_sparsity/model_sparsity": 0.8631477155235828, + "compression_loss": 101.02296447753906, + "distillation_loss": 5.24118709564209, + "epoch": 3.9, + "learning_rate": 3.389687235841082e-05, + "loss": 105.4196, + "step": 4612, + "task_loss": 3.7000200748443604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9504078812013926, + "compression/movement_sparsity/importance_threshold": -0.0003473300492071018, + "compression/movement_sparsity/linear_layer_sparsity": 0.8938983986510618, + "compression/movement_sparsity/model_sparsity": 0.863190238704272, + "compression_loss": 101.03450775146484, + "distillation_loss": 5.289562225341797, + "epoch": 3.9, + "learning_rate": 3.389217619986851e-05, + "loss": 106.3182, + "step": 4613, + "task_loss": 3.251208782196045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.950521798734454, + "compression/movement_sparsity/importance_threshold": -0.00034653220101423833, + "compression/movement_sparsity/linear_layer_sparsity": 0.8939816293411602, + "compression/movement_sparsity/model_sparsity": 0.863270610164118, + "compression_loss": 101.04598236083984, + "distillation_loss": 5.181283473968506, + "epoch": 3.9, + "learning_rate": 3.38874800413262e-05, + "loss": 105.5977, + "step": 4614, + "task_loss": 3.2465696334838867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9506355416812919, + "compression/movement_sparsity/importance_threshold": -0.0003457355755769869, + "compression/movement_sparsity/linear_layer_sparsity": 0.8939361386416294, + "compression/movement_sparsity/model_sparsity": 0.8632266822100618, + "compression_loss": 101.05746459960938, + "distillation_loss": 3.699388027191162, + "epoch": 3.9, + "learning_rate": 3.3882783882783884e-05, + "loss": 104.9557, + "step": 4615, + "task_loss": 2.6295366287231445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9507491101757917, + "compression/movement_sparsity/importance_threshold": -0.00034494017195765055, + "compression/movement_sparsity/linear_layer_sparsity": 0.8938660483842658, + "compression/movement_sparsity/model_sparsity": 0.8631589997686615, + "compression_loss": 101.0689697265625, + "distillation_loss": 4.068443298339844, + "epoch": 3.9, + "learning_rate": 3.387808772424157e-05, + "loss": 105.0924, + "step": 4616, + "task_loss": 1.592682957649231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9508625043518383, + "compression/movement_sparsity/importance_threshold": -0.00034414598921853404, + "compression/movement_sparsity/linear_layer_sparsity": 0.8939132680881038, + "compression/movement_sparsity/model_sparsity": 0.8632045973304079, + "compression_loss": 101.08039855957031, + "distillation_loss": 4.861985683441162, + "epoch": 3.9, + "learning_rate": 3.387339156569926e-05, + "loss": 106.1402, + "step": 4617, + "task_loss": 2.5748844146728516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.950975724343317, + "compression/movement_sparsity/importance_threshold": -0.0003433530264219404, + "compression/movement_sparsity/linear_layer_sparsity": 0.8939078068193266, + "compression/movement_sparsity/model_sparsity": 0.863199323673014, + "compression_loss": 101.09178161621094, + "distillation_loss": 5.338858604431152, + "epoch": 3.9, + "learning_rate": 3.386869540715695e-05, + "loss": 105.9183, + "step": 4618, + "task_loss": 3.4013051986694336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9510887702841129, + "compression/movement_sparsity/importance_threshold": -0.00034256128263017525, + "compression/movement_sparsity/linear_layer_sparsity": 0.8939497083443989, + "compression/movement_sparsity/model_sparsity": 0.8632397857517959, + "compression_loss": 101.10321044921875, + "distillation_loss": 5.493746757507324, + "epoch": 3.9, + "learning_rate": 3.3863999248614636e-05, + "loss": 105.695, + "step": 4619, + "task_loss": 3.2099924087524414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9512016423081112, + "compression/movement_sparsity/importance_threshold": -0.0003417707569055399, + "compression/movement_sparsity/linear_layer_sparsity": 0.894004917240553, + "compression/movement_sparsity/model_sparsity": 0.8632930980525246, + "compression_loss": 101.11459350585938, + "distillation_loss": 5.106605052947998, + "epoch": 3.9, + "learning_rate": 3.385930309007232e-05, + "loss": 105.6518, + "step": 4620, + "task_loss": 2.951779842376709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.951314340549197, + "compression/movement_sparsity/importance_threshold": -0.00034098144831034086, + "compression/movement_sparsity/linear_layer_sparsity": 0.8940775711939584, + "compression/movement_sparsity/model_sparsity": 0.8633632561191208, + "compression_loss": 101.1259765625, + "distillation_loss": 4.7546234130859375, + "epoch": 3.91, + "learning_rate": 3.385460693153001e-05, + "loss": 105.4049, + "step": 4621, + "task_loss": 2.6324379444122314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9514268651412552, + "compression/movement_sparsity/importance_threshold": -0.0003401933559068829, + "compression/movement_sparsity/linear_layer_sparsity": 0.8941145003411266, + "compression/movement_sparsity/model_sparsity": 0.8633989166364765, + "compression_loss": 101.13729095458984, + "distillation_loss": 4.950292587280273, + "epoch": 3.91, + "learning_rate": 3.3849910772987695e-05, + "loss": 105.8242, + "step": 4622, + "task_loss": 2.4742188453674316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9515392162181713, + "compression/movement_sparsity/importance_threshold": -0.00033940647875746726, + "compression/movement_sparsity/linear_layer_sparsity": 0.8942456781092887, + "compression/movement_sparsity/model_sparsity": 0.8635255880447522, + "compression_loss": 101.14863586425781, + "distillation_loss": 4.755215644836426, + "epoch": 3.91, + "learning_rate": 3.384521461444539e-05, + "loss": 105.9388, + "step": 4623, + "task_loss": 2.502890110015869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9516513939138304, + "compression/movement_sparsity/importance_threshold": -0.0003386208159243979, + "compression/movement_sparsity/linear_layer_sparsity": 0.8942765259309627, + "compression/movement_sparsity/model_sparsity": 0.8635553761488527, + "compression_loss": 101.15995788574219, + "distillation_loss": 5.041110992431641, + "epoch": 3.91, + "learning_rate": 3.3840518455903075e-05, + "loss": 105.6302, + "step": 4624, + "task_loss": 2.496145009994507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9517633983621173, + "compression/movement_sparsity/importance_threshold": -0.0003378363664699813, + "compression/movement_sparsity/linear_layer_sparsity": 0.8944004895777051, + "compression/movement_sparsity/model_sparsity": 0.8636750812629728, + "compression_loss": 101.1712646484375, + "distillation_loss": 4.949995040893555, + "epoch": 3.91, + "learning_rate": 3.383582229736076e-05, + "loss": 105.3401, + "step": 4625, + "task_loss": 2.899430990219116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9518752296969175, + "compression/movement_sparsity/importance_threshold": -0.00033705312945651964, + "compression/movement_sparsity/linear_layer_sparsity": 0.8945225811301287, + "compression/movement_sparsity/model_sparsity": 0.8637929785949731, + "compression_loss": 101.18251037597656, + "distillation_loss": 4.48978328704834, + "epoch": 3.91, + "learning_rate": 3.383112613881845e-05, + "loss": 105.5277, + "step": 4626, + "task_loss": 1.7026641368865967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9519868880521161, + "compression/movement_sparsity/importance_threshold": -0.00033627110394631766, + "compression/movement_sparsity/linear_layer_sparsity": 0.8946228037591082, + "compression/movement_sparsity/model_sparsity": 0.8638897582683261, + "compression_loss": 101.19380187988281, + "distillation_loss": 5.110532760620117, + "epoch": 3.91, + "learning_rate": 3.3826429980276134e-05, + "loss": 106.3328, + "step": 4627, + "task_loss": 2.773049831390381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.952098373561598, + "compression/movement_sparsity/importance_threshold": -0.0003354902890016793, + "compression/movement_sparsity/linear_layer_sparsity": 0.894765059079004, + "compression/movement_sparsity/model_sparsity": 0.8640271266803551, + "compression_loss": 101.205078125, + "distillation_loss": 4.636569976806641, + "epoch": 3.91, + "learning_rate": 3.382173382173383e-05, + "loss": 104.9572, + "step": 4628, + "task_loss": 2.68298077583313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9522096863592486, + "compression/movement_sparsity/importance_threshold": -0.0003347106836849075, + "compression/movement_sparsity/linear_layer_sparsity": 0.8948279590632832, + "compression/movement_sparsity/model_sparsity": 0.8640878658566713, + "compression_loss": 101.2163314819336, + "distillation_loss": 6.372221946716309, + "epoch": 3.91, + "learning_rate": 3.3817037663191506e-05, + "loss": 106.0487, + "step": 4629, + "task_loss": 3.299360513687134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.952320826578953, + "compression/movement_sparsity/importance_threshold": -0.00033393228705830714, + "compression/movement_sparsity/linear_layer_sparsity": 0.8948857435796466, + "compression/movement_sparsity/model_sparsity": 0.8641436652971316, + "compression_loss": 101.22756958007812, + "distillation_loss": 4.611349582672119, + "epoch": 3.91, + "learning_rate": 3.38123415046492e-05, + "loss": 106.4568, + "step": 4630, + "task_loss": 2.840118646621704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9524317943545961, + "compression/movement_sparsity/importance_threshold": -0.00033315509818418205, + "compression/movement_sparsity/linear_layer_sparsity": 0.8949338814443926, + "compression/movement_sparsity/model_sparsity": 0.8641901494781342, + "compression_loss": 101.23886108398438, + "distillation_loss": 4.797815322875977, + "epoch": 3.91, + "learning_rate": 3.3807645346106886e-05, + "loss": 105.6075, + "step": 4631, + "task_loss": 2.321150302886963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9525425898200633, + "compression/movement_sparsity/importance_threshold": -0.000332379116124837, + "compression/movement_sparsity/linear_layer_sparsity": 0.8949515530608289, + "compression/movement_sparsity/model_sparsity": 0.8642072140201816, + "compression_loss": 101.25005340576172, + "distillation_loss": 5.359033584594727, + "epoch": 3.91, + "learning_rate": 3.380294918756457e-05, + "loss": 105.8669, + "step": 4632, + "task_loss": 2.8965680599212646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9526532131092398, + "compression/movement_sparsity/importance_threshold": -0.0003316043399425742, + "compression/movement_sparsity/linear_layer_sparsity": 0.8950072389236884, + "compression/movement_sparsity/model_sparsity": 0.8642609869023422, + "compression_loss": 101.26126861572266, + "distillation_loss": 6.089877128601074, + "epoch": 3.92, + "learning_rate": 3.379825302902226e-05, + "loss": 105.6845, + "step": 4633, + "task_loss": 2.9919357299804688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9527636643560105, + "compression/movement_sparsity/importance_threshold": -0.0003308307686996992, + "compression/movement_sparsity/linear_layer_sparsity": 0.8950930690823313, + "compression/movement_sparsity/model_sparsity": 0.8643438685309913, + "compression_loss": 101.27244567871094, + "distillation_loss": 5.567140579223633, + "epoch": 3.92, + "learning_rate": 3.3793556870479945e-05, + "loss": 105.5333, + "step": 4634, + "task_loss": 2.6638662815093994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9528739436942606, + "compression/movement_sparsity/importance_threshold": -0.00033005840145851513, + "compression/movement_sparsity/linear_layer_sparsity": 0.8951400741511519, + "compression/movement_sparsity/model_sparsity": 0.8643892588310934, + "compression_loss": 101.28358459472656, + "distillation_loss": 3.47590970993042, + "epoch": 3.92, + "learning_rate": 3.378886071193764e-05, + "loss": 106.1557, + "step": 4635, + "task_loss": 2.424560308456421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9529840512578753, + "compression/movement_sparsity/importance_threshold": -0.0003292872372813267, + "compression/movement_sparsity/linear_layer_sparsity": 0.8952808747225962, + "compression/movement_sparsity/model_sparsity": 0.8645252224697553, + "compression_loss": 101.29469299316406, + "distillation_loss": 3.531162738800049, + "epoch": 3.92, + "learning_rate": 3.3784164553395324e-05, + "loss": 105.9478, + "step": 4636, + "task_loss": 1.9088877439498901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9530939871807399, + "compression/movement_sparsity/importance_threshold": -0.00032851727523043606, + "compression/movement_sparsity/linear_layer_sparsity": 0.8953318743875748, + "compression/movement_sparsity/model_sparsity": 0.8645744701393485, + "compression_loss": 101.30580139160156, + "distillation_loss": 3.3463287353515625, + "epoch": 3.92, + "learning_rate": 3.377946839485302e-05, + "loss": 105.5874, + "step": 4637, + "task_loss": 2.1914350986480713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9532037515967392, + "compression/movement_sparsity/importance_threshold": -0.00032774851436814974, + "compression/movement_sparsity/linear_layer_sparsity": 0.8953269974030117, + "compression/movement_sparsity/model_sparsity": 0.8645697606942085, + "compression_loss": 101.31686401367188, + "distillation_loss": 4.6450605392456055, + "epoch": 3.92, + "learning_rate": 3.37747722363107e-05, + "loss": 106.0068, + "step": 4638, + "task_loss": 3.1419427394866943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9533133446397585, + "compression/movement_sparsity/importance_threshold": -0.00032698095375677076, + "compression/movement_sparsity/linear_layer_sparsity": 0.8953634257351393, + "compression/movement_sparsity/model_sparsity": 0.8646049376010609, + "compression_loss": 101.32795715332031, + "distillation_loss": 6.041463851928711, + "epoch": 3.92, + "learning_rate": 3.377007607776838e-05, + "loss": 106.3565, + "step": 4639, + "task_loss": 2.350584030151367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.953422766443683, + "compression/movement_sparsity/importance_threshold": -0.00032621459245860216, + "compression/movement_sparsity/linear_layer_sparsity": 0.8954240601275676, + "compression/movement_sparsity/model_sparsity": 0.8646634890155762, + "compression_loss": 101.33897399902344, + "distillation_loss": 5.515351295471191, + "epoch": 3.92, + "learning_rate": 3.3765379919226076e-05, + "loss": 106.0872, + "step": 4640, + "task_loss": 2.9922573566436768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9535320171423978, + "compression/movement_sparsity/importance_threshold": -0.0003254494295359487, + "compression/movement_sparsity/linear_layer_sparsity": 0.8954567919677281, + "compression/movement_sparsity/model_sparsity": 0.8646950964163321, + "compression_loss": 101.34993743896484, + "distillation_loss": 4.323127269744873, + "epoch": 3.92, + "learning_rate": 3.376068376068376e-05, + "loss": 105.6536, + "step": 4641, + "task_loss": 1.8231791257858276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.953641096869788, + "compression/movement_sparsity/importance_threshold": -0.0003246854640511134, + "compression/movement_sparsity/linear_layer_sparsity": 0.8955853941156808, + "compression/movement_sparsity/model_sparsity": 0.8648192806848763, + "compression_loss": 101.36093139648438, + "distillation_loss": 5.5626044273376465, + "epoch": 3.92, + "learning_rate": 3.375598760214145e-05, + "loss": 105.8588, + "step": 4642, + "task_loss": 1.7434442043304443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9537500057597388, + "compression/movement_sparsity/importance_threshold": -0.0003239226950664028, + "compression/movement_sparsity/linear_layer_sparsity": 0.89563008589598, + "compression/movement_sparsity/model_sparsity": 0.8648624371650342, + "compression_loss": 101.37188720703125, + "distillation_loss": 3.5114352703094482, + "epoch": 3.92, + "learning_rate": 3.3751291443599136e-05, + "loss": 105.8041, + "step": 4643, + "task_loss": 1.7396771907806396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9538587439461352, + "compression/movement_sparsity/importance_threshold": -0.000323161121644119, + "compression/movement_sparsity/linear_layer_sparsity": 0.8955506947878604, + "compression/movement_sparsity/model_sparsity": 0.8647857733857142, + "compression_loss": 101.38284301757812, + "distillation_loss": 4.426527976989746, + "epoch": 3.93, + "learning_rate": 3.374659528505683e-05, + "loss": 105.6584, + "step": 4644, + "task_loss": 2.5360474586486816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9539673115628625, + "compression/movement_sparsity/importance_threshold": -0.000322400742846566, + "compression/movement_sparsity/linear_layer_sparsity": 0.8954914793713807, + "compression/movement_sparsity/model_sparsity": 0.8647285922009585, + "compression_loss": 101.393798828125, + "distillation_loss": 5.229212760925293, + "epoch": 3.93, + "learning_rate": 3.3741899126514515e-05, + "loss": 106.0822, + "step": 4645, + "task_loss": 3.2477760314941406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9540757087438059, + "compression/movement_sparsity/importance_threshold": -0.00032164155773604756, + "compression/movement_sparsity/linear_layer_sparsity": 0.89551049841876, + "compression/movement_sparsity/model_sparsity": 0.8647469578855508, + "compression_loss": 101.40473175048828, + "distillation_loss": 3.6485395431518555, + "epoch": 3.93, + "learning_rate": 3.37372029679722e-05, + "loss": 105.7291, + "step": 4646, + "task_loss": 2.2173216342926025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9541839356228503, + "compression/movement_sparsity/importance_threshold": -0.00032088356537486856, + "compression/movement_sparsity/linear_layer_sparsity": 0.8956782953057318, + "compression/movement_sparsity/model_sparsity": 0.8649089904332515, + "compression_loss": 101.41555786132812, + "distillation_loss": 5.659440040588379, + "epoch": 3.93, + "learning_rate": 3.373250680942989e-05, + "loss": 105.5045, + "step": 4647, + "task_loss": 2.8491249084472656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9542919923338811, + "compression/movement_sparsity/importance_threshold": -0.000320126764825332, + "compression/movement_sparsity/linear_layer_sparsity": 0.8957486001981129, + "compression/movement_sparsity/model_sparsity": 0.8649768801362961, + "compression_loss": 101.42647552490234, + "distillation_loss": 5.4664483070373535, + "epoch": 3.93, + "learning_rate": 3.3727810650887574e-05, + "loss": 106.3987, + "step": 4648, + "task_loss": 3.7156589031219482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9543998790107833, + "compression/movement_sparsity/importance_threshold": -0.00031937115514974173, + "compression/movement_sparsity/linear_layer_sparsity": 0.8958402374263945, + "compression/movement_sparsity/model_sparsity": 0.8650653693438772, + "compression_loss": 101.43732452392578, + "distillation_loss": 4.4118332862854, + "epoch": 3.93, + "learning_rate": 3.372311449234527e-05, + "loss": 105.9383, + "step": 4649, + "task_loss": 1.658392310142517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9545075957874419, + "compression/movement_sparsity/importance_threshold": -0.00031861673541040345, + "compression/movement_sparsity/linear_layer_sparsity": 0.8958693443195936, + "compression/movement_sparsity/model_sparsity": 0.8650934763257517, + "compression_loss": 101.4481201171875, + "distillation_loss": 6.654323577880859, + "epoch": 3.93, + "learning_rate": 3.3718418333802953e-05, + "loss": 106.2466, + "step": 4650, + "task_loss": 3.152669668197632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9546151427977424, + "compression/movement_sparsity/importance_threshold": -0.0003178635046696193, + "compression/movement_sparsity/linear_layer_sparsity": 0.8959442400165145, + "compression/movement_sparsity/model_sparsity": 0.8651657991250772, + "compression_loss": 101.45892333984375, + "distillation_loss": 3.328441619873047, + "epoch": 3.93, + "learning_rate": 3.371372217526064e-05, + "loss": 106.0105, + "step": 4651, + "task_loss": 1.8803482055664062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9547225201755696, + "compression/movement_sparsity/importance_threshold": -0.000317111461989694, + "compression/movement_sparsity/linear_layer_sparsity": 0.8959275700301595, + "compression/movement_sparsity/model_sparsity": 0.8651497018040364, + "compression_loss": 101.46969604492188, + "distillation_loss": 5.105778217315674, + "epoch": 3.93, + "learning_rate": 3.3709026016718326e-05, + "loss": 106.0904, + "step": 4652, + "task_loss": 2.0406389236450195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9548297280548088, + "compression/movement_sparsity/importance_threshold": -0.0003163606064329324, + "compression/movement_sparsity/linear_layer_sparsity": 0.896016166595694, + "compression/movement_sparsity/model_sparsity": 0.8652352548049899, + "compression_loss": 101.48047637939453, + "distillation_loss": 5.9561262130737305, + "epoch": 3.93, + "learning_rate": 3.370432985817601e-05, + "loss": 106.2431, + "step": 4653, + "task_loss": 3.019451379776001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9549367665693451, + "compression/movement_sparsity/importance_threshold": -0.00031561093706163664, + "compression/movement_sparsity/linear_layer_sparsity": 0.8960252289630973, + "compression/movement_sparsity/model_sparsity": 0.8652440058521937, + "compression_loss": 101.49125671386719, + "distillation_loss": 5.095379829406738, + "epoch": 3.93, + "learning_rate": 3.3699633699633706e-05, + "loss": 105.8268, + "step": 4654, + "task_loss": 3.170586585998535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9550436358530636, + "compression/movement_sparsity/importance_threshold": -0.0003148624529381132, + "compression/movement_sparsity/linear_layer_sparsity": 0.896153962276894, + "compression/movement_sparsity/model_sparsity": 0.8653683167806316, + "compression_loss": 101.5019302368164, + "distillation_loss": 5.1330485343933105, + "epoch": 3.93, + "learning_rate": 3.3694937541091385e-05, + "loss": 106.0014, + "step": 4655, + "task_loss": 2.6393909454345703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9551503360398497, + "compression/movement_sparsity/importance_threshold": -0.0003141151531246625, + "compression/movement_sparsity/linear_layer_sparsity": 0.896142503151796, + "compression/movement_sparsity/model_sparsity": 0.865357251311733, + "compression_loss": 101.51266479492188, + "distillation_loss": 5.0952959060668945, + "epoch": 3.94, + "learning_rate": 3.369024138254908e-05, + "loss": 106.3751, + "step": 4656, + "task_loss": 3.0116846561431885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9552568672635882, + "compression/movement_sparsity/importance_threshold": -0.0003133690366835919, + "compression/movement_sparsity/linear_layer_sparsity": 0.8961847743260651, + "compression/movement_sparsity/model_sparsity": 0.8653980703411247, + "compression_loss": 101.52335357666016, + "distillation_loss": 3.407650947570801, + "epoch": 3.94, + "learning_rate": 3.3685545224006765e-05, + "loss": 105.9285, + "step": 4657, + "task_loss": 2.41762375831604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9553632296581643, + "compression/movement_sparsity/importance_threshold": -0.0003126241026772036, + "compression/movement_sparsity/linear_layer_sparsity": 0.8962495344804955, + "compression/movement_sparsity/model_sparsity": 0.8654606057850248, + "compression_loss": 101.53398132324219, + "distillation_loss": 5.331636428833008, + "epoch": 3.94, + "learning_rate": 3.368084906546445e-05, + "loss": 105.6985, + "step": 4658, + "task_loss": 3.657886505126953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9554694233574633, + "compression/movement_sparsity/importance_threshold": -0.0003118803501678015, + "compression/movement_sparsity/linear_layer_sparsity": 0.8962886934470117, + "compression/movement_sparsity/model_sparsity": 0.8654984195205742, + "compression_loss": 101.54467010498047, + "distillation_loss": 5.184813499450684, + "epoch": 3.94, + "learning_rate": 3.367615290692214e-05, + "loss": 106.2971, + "step": 4659, + "task_loss": 3.0214946269989014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9555754484953703, + "compression/movement_sparsity/importance_threshold": -0.00031113777821769033, + "compression/movement_sparsity/linear_layer_sparsity": 0.8963683707351545, + "compression/movement_sparsity/model_sparsity": 0.8655753596487532, + "compression_loss": 101.5553207397461, + "distillation_loss": 3.6411643028259277, + "epoch": 3.94, + "learning_rate": 3.3671456748379824e-05, + "loss": 105.1734, + "step": 4660, + "task_loss": 1.627065896987915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9556813052057704, + "compression/movement_sparsity/importance_threshold": -0.000310396385889174, + "compression/movement_sparsity/linear_layer_sparsity": 0.8964309010702369, + "compression/movement_sparsity/model_sparsity": 0.8656357418744598, + "compression_loss": 101.56591033935547, + "distillation_loss": 6.332613468170166, + "epoch": 3.94, + "learning_rate": 3.366676058983752e-05, + "loss": 106.0168, + "step": 4661, + "task_loss": 3.449110269546509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9557869936225487, + "compression/movement_sparsity/importance_threshold": -0.00030965617224455646, + "compression/movement_sparsity/linear_layer_sparsity": 0.8964505878710037, + "compression/movement_sparsity/model_sparsity": 0.8656547523730566, + "compression_loss": 101.57649993896484, + "distillation_loss": 2.3319077491760254, + "epoch": 3.94, + "learning_rate": 3.36620644312952e-05, + "loss": 105.6631, + "step": 4662, + "task_loss": 1.0662107467651367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9558925138795905, + "compression/movement_sparsity/importance_threshold": -0.0003089171363461407, + "compression/movement_sparsity/linear_layer_sparsity": 0.896442491361179, + "compression/movement_sparsity/model_sparsity": 0.8656469340032521, + "compression_loss": 101.58708190917969, + "distillation_loss": 2.981451988220215, + "epoch": 3.94, + "learning_rate": 3.365736827275289e-05, + "loss": 105.858, + "step": 4663, + "task_loss": 2.2606122493743896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9559978661107807, + "compression/movement_sparsity/importance_threshold": -0.0003081792772562323, + "compression/movement_sparsity/linear_layer_sparsity": 0.8964808275601284, + "compression/movement_sparsity/model_sparsity": 0.8656839532358316, + "compression_loss": 101.59762573242188, + "distillation_loss": 4.395840167999268, + "epoch": 3.94, + "learning_rate": 3.3652672114210576e-05, + "loss": 105.6962, + "step": 4664, + "task_loss": 1.8972145318984985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9561030504500045, + "compression/movement_sparsity/importance_threshold": -0.00030744259403713437, + "compression/movement_sparsity/linear_layer_sparsity": 0.8965512278458505, + "compression/movement_sparsity/model_sparsity": 0.8657519350551626, + "compression_loss": 101.60818481445312, + "distillation_loss": 5.024073600769043, + "epoch": 3.94, + "learning_rate": 3.364797595566826e-05, + "loss": 105.8219, + "step": 4665, + "task_loss": 3.489093542098999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9562080670311472, + "compression/movement_sparsity/importance_threshold": -0.0003067070857511508, + "compression/movement_sparsity/linear_layer_sparsity": 0.8966131142758806, + "compression/movement_sparsity/model_sparsity": 0.8658116954959362, + "compression_loss": 101.61870574951172, + "distillation_loss": 4.183615207672119, + "epoch": 3.94, + "learning_rate": 3.3643279797125955e-05, + "loss": 105.8607, + "step": 4666, + "task_loss": 1.938394546508789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.956312915988094, + "compression/movement_sparsity/importance_threshold": -0.00030597275146058544, + "compression/movement_sparsity/linear_layer_sparsity": 0.8966769920419059, + "compression/movement_sparsity/model_sparsity": 0.8658733788641876, + "compression_loss": 101.62920379638672, + "distillation_loss": 3.818817615509033, + "epoch": 3.94, + "learning_rate": 3.363858363858364e-05, + "loss": 105.7471, + "step": 4667, + "task_loss": 2.4525699615478516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9564175974547297, + "compression/movement_sparsity/importance_threshold": -0.0003052395902277431, + "compression/movement_sparsity/linear_layer_sparsity": 0.8966454168460062, + "compression/movement_sparsity/model_sparsity": 0.8658428883734036, + "compression_loss": 101.63967895507812, + "distillation_loss": 3.6939475536346436, + "epoch": 3.95, + "learning_rate": 3.363388748004133e-05, + "loss": 105.8869, + "step": 4668, + "task_loss": 2.4718070030212402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9565221115649396, + "compression/movement_sparsity/importance_threshold": -0.00030450760111492767, + "compression/movement_sparsity/linear_layer_sparsity": 0.8966742494833496, + "compression/movement_sparsity/model_sparsity": 0.8658707305209548, + "compression_loss": 101.65015411376953, + "distillation_loss": 3.8367724418640137, + "epoch": 3.95, + "learning_rate": 3.3629191321499014e-05, + "loss": 106.0676, + "step": 4669, + "task_loss": 2.6185972690582275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.956626458452609, + "compression/movement_sparsity/importance_threshold": -0.0003037767831844413, + "compression/movement_sparsity/linear_layer_sparsity": 0.8967708948620383, + "compression/movement_sparsity/model_sparsity": 0.8659640558335695, + "compression_loss": 101.66059112548828, + "distillation_loss": 5.730309963226318, + "epoch": 3.95, + "learning_rate": 3.362449516295671e-05, + "loss": 105.5365, + "step": 4670, + "task_loss": 2.2513010501861572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9567306382516227, + "compression/movement_sparsity/importance_threshold": -0.0003030471354985905, + "compression/movement_sparsity/linear_layer_sparsity": 0.8968961343947178, + "compression/movement_sparsity/model_sparsity": 0.8660849930030196, + "compression_loss": 101.67102813720703, + "distillation_loss": 4.1641693115234375, + "epoch": 3.95, + "learning_rate": 3.3619799004414394e-05, + "loss": 106.0015, + "step": 4671, + "task_loss": 2.709794044494629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9568346510958662, + "compression/movement_sparsity/importance_threshold": -0.0003023186571196783, + "compression/movement_sparsity/linear_layer_sparsity": 0.8970167115778517, + "compression/movement_sparsity/model_sparsity": 0.866201427988974, + "compression_loss": 101.68143463134766, + "distillation_loss": 3.810199022293091, + "epoch": 3.95, + "learning_rate": 3.361510284587207e-05, + "loss": 106.0043, + "step": 4672, + "task_loss": 2.0786898136138916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9569384971192245, + "compression/movement_sparsity/importance_threshold": -0.00030159134711000776, + "compression/movement_sparsity/linear_layer_sparsity": 0.8970608906189426, + "compression/movement_sparsity/model_sparsity": 0.8662440893440928, + "compression_loss": 101.69183349609375, + "distillation_loss": 5.2797956466674805, + "epoch": 3.95, + "learning_rate": 3.3610406687329766e-05, + "loss": 105.9176, + "step": 4673, + "task_loss": 3.310303211212158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9570421764555826, + "compression/movement_sparsity/importance_threshold": -0.0003008652045318836, + "compression/movement_sparsity/linear_layer_sparsity": 0.8971112225305335, + "compression/movement_sparsity/model_sparsity": 0.8662926921996815, + "compression_loss": 101.70219421386719, + "distillation_loss": 3.690847873687744, + "epoch": 3.95, + "learning_rate": 3.360571052878745e-05, + "loss": 105.8331, + "step": 4674, + "task_loss": 1.8293949365615845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9571456892388259, + "compression/movement_sparsity/importance_threshold": -0.00030014022844760974, + "compression/movement_sparsity/linear_layer_sparsity": 0.8971754818699231, + "compression/movement_sparsity/model_sparsity": 0.8663547440330783, + "compression_loss": 101.71257019042969, + "distillation_loss": 4.485530853271484, + "epoch": 3.95, + "learning_rate": 3.3601014370245146e-05, + "loss": 106.0385, + "step": 4675, + "task_loss": 2.748704671859741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9572490356028394, + "compression/movement_sparsity/importance_threshold": -0.00029941641791949095, + "compression/movement_sparsity/linear_layer_sparsity": 0.89720132154119, + "compression/movement_sparsity/model_sparsity": 0.866379696032145, + "compression_loss": 101.72290802001953, + "distillation_loss": 4.996626853942871, + "epoch": 3.95, + "learning_rate": 3.3596318211702825e-05, + "loss": 105.9135, + "step": 4676, + "task_loss": 2.5895516872406006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9573522156815082, + "compression/movement_sparsity/importance_threshold": -0.00029869377200983024, + "compression/movement_sparsity/linear_layer_sparsity": 0.8972854465438611, + "compression/movement_sparsity/model_sparsity": 0.8664609310821756, + "compression_loss": 101.73324584960938, + "distillation_loss": 2.9844400882720947, + "epoch": 3.95, + "learning_rate": 3.359162205316052e-05, + "loss": 105.5686, + "step": 4677, + "task_loss": 2.039533853530884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9574552296087175, + "compression/movement_sparsity/importance_threshold": -0.00029797228978093154, + "compression/movement_sparsity/linear_layer_sparsity": 0.8972777316074007, + "compression/movement_sparsity/model_sparsity": 0.8664534811775165, + "compression_loss": 101.74354553222656, + "distillation_loss": 4.03771448135376, + "epoch": 3.95, + "learning_rate": 3.3586925894618205e-05, + "loss": 105.8347, + "step": 4678, + "task_loss": 2.953936815261841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9575580775183525, + "compression/movement_sparsity/importance_threshold": -0.0002972519702950987, + "compression/movement_sparsity/linear_layer_sparsity": 0.8973379486539618, + "compression/movement_sparsity/model_sparsity": 0.866511629583279, + "compression_loss": 101.75387573242188, + "distillation_loss": 4.7701311111450195, + "epoch": 3.95, + "learning_rate": 3.358222973607589e-05, + "loss": 105.8671, + "step": 4679, + "task_loss": 2.3057758808135986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9576607595442982, + "compression/movement_sparsity/importance_threshold": -0.00029653281261463656, + "compression/movement_sparsity/linear_layer_sparsity": 0.8974160161794738, + "compression/movement_sparsity/model_sparsity": 0.8665870152491257, + "compression_loss": 101.76419830322266, + "distillation_loss": 4.448347568511963, + "epoch": 3.96, + "learning_rate": 3.357753357753358e-05, + "loss": 105.7598, + "step": 4680, + "task_loss": 2.425097942352295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9577632758204399, + "compression/movement_sparsity/importance_threshold": -0.0002958148158018481, + "compression/movement_sparsity/linear_layer_sparsity": 0.8976089253634868, + "compression/movement_sparsity/model_sparsity": 0.8667732974092098, + "compression_loss": 101.77444458007812, + "distillation_loss": 5.4012274742126465, + "epoch": 3.96, + "learning_rate": 3.3572837418991264e-05, + "loss": 106.2318, + "step": 4681, + "task_loss": 2.782803535461426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9578656264806625, + "compression/movement_sparsity/importance_threshold": -0.00029509797891903895, + "compression/movement_sparsity/linear_layer_sparsity": 0.8976710622010373, + "compression/movement_sparsity/model_sparsity": 0.8668332996552353, + "compression_loss": 101.78473663330078, + "distillation_loss": 3.6692802906036377, + "epoch": 3.96, + "learning_rate": 3.356814126044896e-05, + "loss": 106.2184, + "step": 4682, + "task_loss": 1.7455674409866333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9579678116588514, + "compression/movement_sparsity/importance_threshold": -0.0002943823010285113, + "compression/movement_sparsity/linear_layer_sparsity": 0.8977285247648745, + "compression/movement_sparsity/model_sparsity": 0.8668887882032291, + "compression_loss": 101.79500579833984, + "distillation_loss": 4.911920547485352, + "epoch": 3.96, + "learning_rate": 3.356344510190664e-05, + "loss": 106.4521, + "step": 4683, + "task_loss": 2.149130344390869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9580698314888917, + "compression/movement_sparsity/importance_threshold": -0.000293667781192569, + "compression/movement_sparsity/linear_layer_sparsity": 0.8977314342617776, + "compression/movement_sparsity/model_sparsity": 0.8668915977499629, + "compression_loss": 101.80519104003906, + "distillation_loss": 4.967400550842285, + "epoch": 3.96, + "learning_rate": 3.355874894336433e-05, + "loss": 106.0529, + "step": 4684, + "task_loss": 3.211292266845703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9581716861046683, + "compression/movement_sparsity/importance_threshold": -0.00029295441847351864, + "compression/movement_sparsity/linear_layer_sparsity": 0.8977829705142999, + "compression/movement_sparsity/model_sparsity": 0.866941363573667, + "compression_loss": 101.8154296875, + "distillation_loss": 2.9081835746765137, + "epoch": 3.96, + "learning_rate": 3.3554052784822016e-05, + "loss": 105.4729, + "step": 4685, + "task_loss": 1.7217074632644653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9582733756400666, + "compression/movement_sparsity/importance_threshold": -0.00029224221193366144, + "compression/movement_sparsity/linear_layer_sparsity": 0.8979159011351044, + "compression/movement_sparsity/model_sparsity": 0.8670697276187045, + "compression_loss": 101.82569122314453, + "distillation_loss": 3.920755624771118, + "epoch": 3.96, + "learning_rate": 3.35493566262797e-05, + "loss": 106.2553, + "step": 4686, + "task_loss": 2.587437152862549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9583749002289718, + "compression/movement_sparsity/importance_threshold": -0.0002915311606353013, + "compression/movement_sparsity/linear_layer_sparsity": 0.898004855425668, + "compression/movement_sparsity/model_sparsity": 0.8671556260557318, + "compression_loss": 101.83583068847656, + "distillation_loss": 4.951447010040283, + "epoch": 3.96, + "learning_rate": 3.3544660467737395e-05, + "loss": 106.7086, + "step": 4687, + "task_loss": 3.45975399017334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9584762600052686, + "compression/movement_sparsity/importance_threshold": -0.00029082126364074565, + "compression/movement_sparsity/linear_layer_sparsity": 0.8980404729143963, + "compression/movement_sparsity/model_sparsity": 0.8671900199741501, + "compression_loss": 101.84609985351562, + "distillation_loss": 4.715553283691406, + "epoch": 3.96, + "learning_rate": 3.353996430919508e-05, + "loss": 106.6261, + "step": 4688, + "task_loss": 2.714484214782715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9585774551028428, + "compression/movement_sparsity/importance_threshold": -0.00029011252001229313, + "compression/movement_sparsity/linear_layer_sparsity": 0.8980800253784444, + "compression/movement_sparsity/model_sparsity": 0.8672282136893806, + "compression_loss": 101.85623168945312, + "distillation_loss": 3.2695062160491943, + "epoch": 3.96, + "learning_rate": 3.353526815065277e-05, + "loss": 106.0076, + "step": 4689, + "task_loss": 1.284071922302246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9586784856555789, + "compression/movement_sparsity/importance_threshold": -0.00028940492881225285, + "compression/movement_sparsity/linear_layer_sparsity": 0.8981603704199749, + "compression/movement_sparsity/model_sparsity": 0.8673057986315642, + "compression_loss": 101.86640167236328, + "distillation_loss": 3.63126540184021, + "epoch": 3.96, + "learning_rate": 3.3530571992110454e-05, + "loss": 105.3049, + "step": 4690, + "task_loss": 2.9905450344085693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9587793517973624, + "compression/movement_sparsity/importance_threshold": -0.0002886984891029261, + "compression/movement_sparsity/linear_layer_sparsity": 0.8981480289064718, + "compression/movement_sparsity/model_sparsity": 0.8672938810870169, + "compression_loss": 101.87654876708984, + "distillation_loss": 2.702547073364258, + "epoch": 3.96, + "learning_rate": 3.352587583356814e-05, + "loss": 106.0171, + "step": 4691, + "task_loss": 2.364891767501831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9588800536620783, + "compression/movement_sparsity/importance_threshold": -0.0002879931999466177, + "compression/movement_sparsity/linear_layer_sparsity": 0.8982173798654419, + "compression/movement_sparsity/model_sparsity": 0.867360849627198, + "compression_loss": 101.88666534423828, + "distillation_loss": 5.127796173095703, + "epoch": 3.97, + "learning_rate": 3.3521179675025834e-05, + "loss": 106.386, + "step": 4692, + "task_loss": 3.3984546661376953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9589805913836119, + "compression/movement_sparsity/importance_threshold": -0.0002872890604056306, + "compression/movement_sparsity/linear_layer_sparsity": 0.8982264899295158, + "compression/movement_sparsity/model_sparsity": 0.8673696467325449, + "compression_loss": 101.89674377441406, + "distillation_loss": 4.128251075744629, + "epoch": 3.97, + "learning_rate": 3.3516483516483513e-05, + "loss": 105.8835, + "step": 4693, + "task_loss": 1.906435251235962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9590809650958482, + "compression/movement_sparsity/importance_threshold": -0.00028658606954226964, + "compression/movement_sparsity/linear_layer_sparsity": 0.8982572900545192, + "compression/movement_sparsity/model_sparsity": 0.8673993887785022, + "compression_loss": 101.90684509277344, + "distillation_loss": 4.028168678283691, + "epoch": 3.97, + "learning_rate": 3.3511787357941207e-05, + "loss": 106.0872, + "step": 4694, + "task_loss": 1.67355215549469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9591811749326724, + "compression/movement_sparsity/importance_threshold": -0.0002858842264188378, + "compression/movement_sparsity/linear_layer_sparsity": 0.8982984045845277, + "compression/movement_sparsity/model_sparsity": 0.8674390908979218, + "compression_loss": 101.9168930053711, + "distillation_loss": 3.3944010734558105, + "epoch": 3.97, + "learning_rate": 3.350709119939889e-05, + "loss": 105.5808, + "step": 4695, + "task_loss": 2.459972620010376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9592812210279696, + "compression/movement_sparsity/importance_threshold": -0.00028518353009764075, + "compression/movement_sparsity/linear_layer_sparsity": 0.8984599412834907, + "compression/movement_sparsity/model_sparsity": 0.8675950783143305, + "compression_loss": 101.92695617675781, + "distillation_loss": 3.7825629711151123, + "epoch": 3.97, + "learning_rate": 3.350239504085658e-05, + "loss": 106.3229, + "step": 4696, + "task_loss": 1.5147826671600342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.959381103515625, + "compression/movement_sparsity/importance_threshold": -0.0002844839796409815, + "compression/movement_sparsity/linear_layer_sparsity": 0.8984181351517594, + "compression/movement_sparsity/model_sparsity": 0.8675547083518348, + "compression_loss": 101.93692016601562, + "distillation_loss": 5.284494400024414, + "epoch": 3.97, + "learning_rate": 3.349769888231427e-05, + "loss": 106.4689, + "step": 4697, + "task_loss": 3.5198774337768555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9594808225295237, + "compression/movement_sparsity/importance_threshold": -0.00028378557411116393, + "compression/movement_sparsity/linear_layer_sparsity": 0.8985062309022531, + "compression/movement_sparsity/model_sparsity": 0.8676397777422848, + "compression_loss": 101.94683837890625, + "distillation_loss": 5.379023551940918, + "epoch": 3.97, + "learning_rate": 3.349300272377195e-05, + "loss": 106.342, + "step": 4698, + "task_loss": 2.7044739723205566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9595803782035508, + "compression/movement_sparsity/importance_threshold": -0.00028308831257049284, + "compression/movement_sparsity/linear_layer_sparsity": 0.8985478343231347, + "compression/movement_sparsity/model_sparsity": 0.867679951957672, + "compression_loss": 101.95682525634766, + "distillation_loss": 5.488298416137695, + "epoch": 3.97, + "learning_rate": 3.3488306565229645e-05, + "loss": 106.6213, + "step": 4699, + "task_loss": 3.1338329315185547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9596797706715915, + "compression/movement_sparsity/importance_threshold": -0.0002823921940812704, + "compression/movement_sparsity/linear_layer_sparsity": 0.8985507080475349, + "compression/movement_sparsity/model_sparsity": 0.8676827269607985, + "compression_loss": 101.96670532226562, + "distillation_loss": 4.243288993835449, + "epoch": 3.97, + "learning_rate": 3.348361040668733e-05, + "loss": 106.7234, + "step": 4700, + "task_loss": 1.9602845907211304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9597790000675309, + "compression/movement_sparsity/importance_threshold": -0.00028169721770580306, + "compression/movement_sparsity/linear_layer_sparsity": 0.8986428341666896, + "compression/movement_sparsity/model_sparsity": 0.8677716882643471, + "compression_loss": 101.97666931152344, + "distillation_loss": 3.7543842792510986, + "epoch": 3.97, + "learning_rate": 3.3478914248145025e-05, + "loss": 106.0184, + "step": 4701, + "task_loss": 2.676030158996582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9598780665252543, + "compression/movement_sparsity/importance_threshold": -0.00028100338250639217, + "compression/movement_sparsity/linear_layer_sparsity": 0.8987100983963234, + "compression/movement_sparsity/model_sparsity": 0.8678366417607641, + "compression_loss": 101.98658752441406, + "distillation_loss": 5.363182067871094, + "epoch": 3.97, + "learning_rate": 3.3474218089602704e-05, + "loss": 106.677, + "step": 4702, + "task_loss": 3.5991029739379883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9599769701786466, + "compression/movement_sparsity/importance_threshold": -0.00028031068754534334, + "compression/movement_sparsity/linear_layer_sparsity": 0.8988445076139149, + "compression/movement_sparsity/model_sparsity": 0.8679664336082402, + "compression_loss": 101.9964599609375, + "distillation_loss": 4.069483757019043, + "epoch": 3.97, + "learning_rate": 3.346952193106039e-05, + "loss": 106.7822, + "step": 4703, + "task_loss": 2.5340805053710938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9600757111615931, + "compression/movement_sparsity/importance_threshold": -0.00027961913188496047, + "compression/movement_sparsity/linear_layer_sparsity": 0.8989302066067139, + "compression/movement_sparsity/model_sparsity": 0.8680491885769955, + "compression_loss": 102.00642395019531, + "distillation_loss": 3.8172523975372314, + "epoch": 3.98, + "learning_rate": 3.3464825772518084e-05, + "loss": 107.1222, + "step": 4704, + "task_loss": 2.0957939624786377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9601742896079788, + "compression/movement_sparsity/importance_threshold": -0.00027892871458754746, + "compression/movement_sparsity/linear_layer_sparsity": 0.8989639758494586, + "compression/movement_sparsity/model_sparsity": 0.8680817977423657, + "compression_loss": 102.01628112792969, + "distillation_loss": 3.121922492980957, + "epoch": 3.98, + "learning_rate": 3.346012961397577e-05, + "loss": 106.5005, + "step": 4705, + "task_loss": 2.029348373413086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.960272705651689, + "compression/movement_sparsity/importance_threshold": -0.0002782394347154082, + "compression/movement_sparsity/linear_layer_sparsity": 0.8990171337887793, + "compression/movement_sparsity/model_sparsity": 0.8681331295429378, + "compression_loss": 102.0261459350586, + "distillation_loss": 4.173529624938965, + "epoch": 3.98, + "learning_rate": 3.3455433455433456e-05, + "loss": 106.5012, + "step": 4706, + "task_loss": 2.138794422149658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9603709594266087, + "compression/movement_sparsity/importance_threshold": -0.0002775512913308475, + "compression/movement_sparsity/linear_layer_sparsity": 0.8989910794824949, + "compression/movement_sparsity/model_sparsity": 0.8681079702822266, + "compression_loss": 102.03602600097656, + "distillation_loss": 4.12360143661499, + "epoch": 3.98, + "learning_rate": 3.345073729689114e-05, + "loss": 106.5132, + "step": 4707, + "task_loss": 2.365708589553833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9604690510666232, + "compression/movement_sparsity/importance_threshold": -0.0002768642834961666, + "compression/movement_sparsity/linear_layer_sparsity": 0.8989867510096431, + "compression/movement_sparsity/model_sparsity": 0.8681037905057333, + "compression_loss": 102.04580688476562, + "distillation_loss": 3.2090349197387695, + "epoch": 3.98, + "learning_rate": 3.3446041138348836e-05, + "loss": 106.082, + "step": 4708, + "task_loss": 1.8742254972457886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9605669807056174, + "compression/movement_sparsity/importance_threshold": -0.00027617841027367286, + "compression/movement_sparsity/linear_layer_sparsity": 0.899009645411504, + "compression/movement_sparsity/model_sparsity": 0.8681258984144588, + "compression_loss": 102.05554962158203, + "distillation_loss": 3.7272791862487793, + "epoch": 3.98, + "learning_rate": 3.344134497980652e-05, + "loss": 105.9248, + "step": 4709, + "task_loss": 1.3674213886260986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9606647484774767, + "compression/movement_sparsity/importance_threshold": -0.00027549367072566763, + "compression/movement_sparsity/linear_layer_sparsity": 0.8990709237089847, + "compression/movement_sparsity/model_sparsity": 0.8681850716139069, + "compression_loss": 102.06529998779297, + "distillation_loss": 4.138559341430664, + "epoch": 3.98, + "learning_rate": 3.34366488212642e-05, + "loss": 106.6362, + "step": 4710, + "task_loss": 2.0805158615112305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9607623545160862, + "compression/movement_sparsity/importance_threshold": -0.0002748100639144565, + "compression/movement_sparsity/linear_layer_sparsity": 0.899139583066232, + "compression/movement_sparsity/model_sparsity": 0.8682513723110119, + "compression_loss": 102.07511138916016, + "distillation_loss": 5.992580413818359, + "epoch": 3.98, + "learning_rate": 3.3431952662721895e-05, + "loss": 107.424, + "step": 4711, + "task_loss": 2.6298882961273193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9608597989553309, + "compression/movement_sparsity/importance_threshold": -0.00027412758890234257, + "compression/movement_sparsity/linear_layer_sparsity": 0.8991688330494427, + "compression/movement_sparsity/model_sparsity": 0.8682796174673159, + "compression_loss": 102.08480072021484, + "distillation_loss": 3.034358024597168, + "epoch": 3.98, + "learning_rate": 3.342725650417958e-05, + "loss": 105.9314, + "step": 4712, + "task_loss": 2.4491961002349854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.960957081929096, + "compression/movement_sparsity/importance_threshold": -0.0002734462447516305, + "compression/movement_sparsity/linear_layer_sparsity": 0.899139547293729, + "compression/movement_sparsity/model_sparsity": 0.8682513377674045, + "compression_loss": 102.09453582763672, + "distillation_loss": 5.255185127258301, + "epoch": 3.98, + "learning_rate": 3.3422560345637274e-05, + "loss": 106.4459, + "step": 4713, + "task_loss": 2.728890895843506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9610542035712667, + "compression/movement_sparsity/importance_threshold": -0.0002727660305246243, + "compression/movement_sparsity/linear_layer_sparsity": 0.8992183541176345, + "compression/movement_sparsity/model_sparsity": 0.8683274373344706, + "compression_loss": 102.1041259765625, + "distillation_loss": 5.061298370361328, + "epoch": 3.98, + "learning_rate": 3.341786418709496e-05, + "loss": 106.3038, + "step": 4714, + "task_loss": 2.866028308868408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.961151164015728, + "compression/movement_sparsity/importance_threshold": -0.0002720869452836278, + "compression/movement_sparsity/linear_layer_sparsity": 0.8992474729350013, + "compression/movement_sparsity/model_sparsity": 0.8683555558308809, + "compression_loss": 102.11378479003906, + "distillation_loss": 4.558058261871338, + "epoch": 3.99, + "learning_rate": 3.341316802855265e-05, + "loss": 106.215, + "step": 4715, + "task_loss": 2.8137505054473877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9612479633963652, + "compression/movement_sparsity/importance_threshold": -0.000271408988090944, + "compression/movement_sparsity/linear_layer_sparsity": 0.8993604424991835, + "compression/movement_sparsity/model_sparsity": 0.8684646445429984, + "compression_loss": 102.12342834472656, + "distillation_loss": 4.739848613739014, + "epoch": 3.99, + "learning_rate": 3.340847187001033e-05, + "loss": 106.535, + "step": 4716, + "task_loss": 1.9558812379837036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9613446018470634, + "compression/movement_sparsity/importance_threshold": -0.0002707321580088778, + "compression/movement_sparsity/linear_layer_sparsity": 0.8993654744979258, + "compression/movement_sparsity/model_sparsity": 0.8684695036771036, + "compression_loss": 102.13296508789062, + "distillation_loss": 5.490961074829102, + "epoch": 3.99, + "learning_rate": 3.340377571146802e-05, + "loss": 107.3266, + "step": 4717, + "task_loss": 2.1332499980926514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9614410795017077, + "compression/movement_sparsity/importance_threshold": -0.00027005645409973293, + "compression/movement_sparsity/linear_layer_sparsity": 0.8993622311243288, + "compression/movement_sparsity/model_sparsity": 0.8684663717233676, + "compression_loss": 102.14260864257812, + "distillation_loss": 5.744338035583496, + "epoch": 3.99, + "learning_rate": 3.339907955292571e-05, + "loss": 106.5651, + "step": 4718, + "task_loss": 3.526001214981079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9615373964941832, + "compression/movement_sparsity/importance_threshold": -0.0002693818754258134, + "compression/movement_sparsity/linear_layer_sparsity": 0.8995838417798413, + "compression/movement_sparsity/model_sparsity": 0.8686803693711092, + "compression_loss": 102.15216827392578, + "distillation_loss": 4.061690807342529, + "epoch": 3.99, + "learning_rate": 3.339438339438339e-05, + "loss": 107.264, + "step": 4719, + "task_loss": 3.2331526279449463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9616335529583752, + "compression/movement_sparsity/importance_threshold": -0.00026870842104942394, + "compression/movement_sparsity/linear_layer_sparsity": 0.8996112196687333, + "compression/movement_sparsity/model_sparsity": 0.8687068067452933, + "compression_loss": 102.16175079345703, + "distillation_loss": 5.684405326843262, + "epoch": 3.99, + "learning_rate": 3.3389687235841085e-05, + "loss": 107.0543, + "step": 4720, + "task_loss": 3.286560535430908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9617295490281687, + "compression/movement_sparsity/importance_threshold": -0.0002680360900328676, + "compression/movement_sparsity/linear_layer_sparsity": 0.8997705623208513, + "compression/movement_sparsity/model_sparsity": 0.8688606754871158, + "compression_loss": 102.17129516601562, + "distillation_loss": 4.273404121398926, + "epoch": 3.99, + "learning_rate": 3.338499107729877e-05, + "loss": 106.9224, + "step": 4721, + "task_loss": 1.763748288154602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9618253848374488, + "compression/movement_sparsity/importance_threshold": -0.00026736488143844824, + "compression/movement_sparsity/linear_layer_sparsity": 0.8998249842219413, + "compression/movement_sparsity/model_sparsity": 0.8689132278284821, + "compression_loss": 102.18086242675781, + "distillation_loss": 4.594601631164551, + "epoch": 3.99, + "learning_rate": 3.338029491875646e-05, + "loss": 106.3894, + "step": 4722, + "task_loss": 2.9253575801849365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9619210605201008, + "compression/movement_sparsity/importance_threshold": -0.0002666947943284698, + "compression/movement_sparsity/linear_layer_sparsity": 0.8998434905301123, + "compression/movement_sparsity/model_sparsity": 0.8689310983880352, + "compression_loss": 102.19035339355469, + "distillation_loss": 4.620210647583008, + "epoch": 3.99, + "learning_rate": 3.3375598760214144e-05, + "loss": 106.6182, + "step": 4723, + "task_loss": 2.4557876586914062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9620165762100098, + "compression/movement_sparsity/importance_threshold": -0.000266025827765237, + "compression/movement_sparsity/linear_layer_sparsity": 0.8999446790166702, + "compression/movement_sparsity/model_sparsity": 0.8690288107387877, + "compression_loss": 102.19989776611328, + "distillation_loss": 5.3723464012146, + "epoch": 3.99, + "learning_rate": 3.337090260167183e-05, + "loss": 106.8429, + "step": 4724, + "task_loss": 3.2323360443115234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9621119320410608, + "compression/movement_sparsity/importance_threshold": -0.0002653579808110538, + "compression/movement_sparsity/linear_layer_sparsity": 0.9000556930173601, + "compression/movement_sparsity/model_sparsity": 0.8691360110670349, + "compression_loss": 102.20938873291016, + "distillation_loss": 4.161534309387207, + "epoch": 3.99, + "learning_rate": 3.3366206443129524e-05, + "loss": 107.314, + "step": 4725, + "task_loss": 2.3385519981384277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9622071281471392, + "compression/movement_sparsity/importance_threshold": -0.0002646912525282241, + "compression/movement_sparsity/linear_layer_sparsity": 0.9001602321950237, + "compression/movement_sparsity/model_sparsity": 0.8692369590023457, + "compression_loss": 102.2188720703125, + "distillation_loss": 4.973778247833252, + "epoch": 3.99, + "learning_rate": 3.336151028458721e-05, + "loss": 107.117, + "step": 4726, + "task_loss": 2.5603156089782715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9623021646621298, + "compression/movement_sparsity/importance_threshold": -0.00026402564197905085, + "compression/movement_sparsity/linear_layer_sparsity": 0.9001480695440351, + "compression/movement_sparsity/model_sparsity": 0.8692252141758352, + "compression_loss": 102.22834777832031, + "distillation_loss": 4.6997151374816895, + "epoch": 4.0, + "learning_rate": 3.3356814126044896e-05, + "loss": 107.3776, + "step": 4727, + "task_loss": 2.01159930229187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9623970417199179, + "compression/movement_sparsity/importance_threshold": -0.0002633611482258406, + "compression/movement_sparsity/linear_layer_sparsity": 0.9001439080095301, + "compression/movement_sparsity/model_sparsity": 0.8692211956028428, + "compression_loss": 102.23784637451172, + "distillation_loss": 4.433721542358398, + "epoch": 4.0, + "learning_rate": 3.335211796750258e-05, + "loss": 106.6445, + "step": 4728, + "task_loss": 3.8411970138549805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9624917594543888, + "compression/movement_sparsity/importance_threshold": -0.0002626977703308938, + "compression/movement_sparsity/linear_layer_sparsity": 0.9002022648859401, + "compression/movement_sparsity/model_sparsity": 0.8692775477410214, + "compression_loss": 102.24736022949219, + "distillation_loss": 4.056196689605713, + "epoch": 4.0, + "learning_rate": 3.334742180896027e-05, + "loss": 106.2125, + "step": 4729, + "task_loss": 1.5910521745681763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9625863179994274, + "compression/movement_sparsity/importance_threshold": -0.0002620355073565169, + "compression/movement_sparsity/linear_layer_sparsity": 0.9002586304263548, + "compression/movement_sparsity/model_sparsity": 0.8693319769517222, + "compression_loss": 102.25675964355469, + "distillation_loss": 5.15647029876709, + "epoch": 4.0, + "learning_rate": 3.334272565041796e-05, + "loss": 107.6808, + "step": 4730, + "task_loss": 4.085256099700928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9626807174889191, + "compression/movement_sparsity/importance_threshold": -0.0002613743583650121, + "compression/movement_sparsity/linear_layer_sparsity": 0.9002695768122445, + "compression/movement_sparsity/model_sparsity": 0.8693425472955816, + "compression_loss": 102.26620483398438, + "distillation_loss": 3.7740845680236816, + "epoch": 4.0, + "learning_rate": 3.333802949187565e-05, + "loss": 106.1293, + "step": 4731, + "task_loss": 2.223395586013794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9627749580567488, + "compression/movement_sparsity/importance_threshold": -0.00026071432241868506, + "compression/movement_sparsity/linear_layer_sparsity": 0.9002622672974837, + "compression/movement_sparsity/model_sparsity": 0.8693354888851395, + "compression_loss": 102.27556610107422, + "distillation_loss": 3.833007335662842, + "epoch": 4.0, + "learning_rate": 3.3333333333333335e-05, + "loss": 106.8324, + "step": 4732, + "task_loss": 2.9822402000427246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9629629629629629, + "compression/movement_sparsity/importance_threshold": -0.0002593975859107797, + "compression/movement_sparsity/linear_layer_sparsity": 0.9003952217666237, + "compression/movement_sparsity/model_sparsity": 0.8694638759592487, + "compression_loss": 102.29492950439453, + "distillation_loss": 2.872431755065918, + "epoch": 4.0, + "learning_rate": 3.332863717479102e-05, + "loss": 186.7031, + "step": 4733, + "task_loss": 1.6889793872833252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9630567275691178, + "compression/movement_sparsity/importance_threshold": -0.0002587408834738075, + "compression/movement_sparsity/linear_layer_sparsity": 0.9005190542475221, + "compression/movement_sparsity/model_sparsity": 0.869583454413475, + "compression_loss": 102.30427551269531, + "distillation_loss": 5.131381988525391, + "epoch": 4.0, + "learning_rate": 3.332394101624871e-05, + "loss": 106.904, + "step": 4734, + "task_loss": 3.1679792404174805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9631503337891512, + "compression/movement_sparsity/importance_threshold": -0.00025808529033122946, + "compression/movement_sparsity/linear_layer_sparsity": 0.9005311692018401, + "compression/movement_sparsity/model_sparsity": 0.8695951531818422, + "compression_loss": 102.31358337402344, + "distillation_loss": 4.491264343261719, + "epoch": 4.0, + "learning_rate": 3.33192448577064e-05, + "loss": 106.0944, + "step": 4735, + "task_loss": 3.6429214477539062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9632437817569484, + "compression/movement_sparsity/importance_threshold": -0.00025743080554534864, + "compression/movement_sparsity/linear_layer_sparsity": 0.9005356049922006, + "compression/movement_sparsity/model_sparsity": 0.8695994365891577, + "compression_loss": 102.32286834716797, + "distillation_loss": 5.38408899307251, + "epoch": 4.0, + "learning_rate": 3.331454869916408e-05, + "loss": 106.8995, + "step": 4736, + "task_loss": 4.077313423156738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9633370716063946, + "compression/movement_sparsity/importance_threshold": -0.0002567774281784672, + "compression/movement_sparsity/linear_layer_sparsity": 0.9005687184057254, + "compression/movement_sparsity/model_sparsity": 0.8696314124550591, + "compression_loss": 102.3321533203125, + "distillation_loss": 4.414595603942871, + "epoch": 4.0, + "learning_rate": 3.3309852540621773e-05, + "loss": 106.334, + "step": 4737, + "task_loss": 2.7615694999694824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9634302034713748, + "compression/movement_sparsity/importance_threshold": -0.00025612515729289253, + "compression/movement_sparsity/linear_layer_sparsity": 0.9006416466149864, + "compression/movement_sparsity/model_sparsity": 0.8697018353559786, + "compression_loss": 102.34141540527344, + "distillation_loss": 3.2609522342681885, + "epoch": 4.01, + "learning_rate": 3.330515638207946e-05, + "loss": 105.8877, + "step": 4738, + "task_loss": 1.8966424465179443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9635231774857742, + "compression/movement_sparsity/importance_threshold": -0.0002554739919509259, + "compression/movement_sparsity/linear_layer_sparsity": 0.9006663892628308, + "compression/movement_sparsity/model_sparsity": 0.8697257280177523, + "compression_loss": 102.3507080078125, + "distillation_loss": 4.083732604980469, + "epoch": 4.01, + "learning_rate": 3.330046022353715e-05, + "loss": 106.7867, + "step": 4739, + "task_loss": 2.075038433074951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9636159937834781, + "compression/movement_sparsity/importance_threshold": -0.0002548239312148721, + "compression/movement_sparsity/linear_layer_sparsity": 0.9006817237424105, + "compression/movement_sparsity/model_sparsity": 0.869740535710784, + "compression_loss": 102.35995483398438, + "distillation_loss": 3.284346580505371, + "epoch": 4.01, + "learning_rate": 3.329576406499483e-05, + "loss": 106.0452, + "step": 4740, + "task_loss": 1.9546148777008057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9637086524983712, + "compression/movement_sparsity/importance_threshold": -0.00025417497414703585, + "compression/movement_sparsity/linear_layer_sparsity": 0.9007792634336719, + "compression/movement_sparsity/model_sparsity": 0.8698347246135834, + "compression_loss": 102.3691635131836, + "distillation_loss": 4.348479747772217, + "epoch": 4.01, + "learning_rate": 3.3291067906452526e-05, + "loss": 106.5826, + "step": 4741, + "task_loss": 1.425667643547058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9638011537643392, + "compression/movement_sparsity/importance_threshold": -0.0002535271198097202, + "compression/movement_sparsity/linear_layer_sparsity": 0.9009048726155481, + "compression/movement_sparsity/model_sparsity": 0.8699560187336431, + "compression_loss": 102.37838745117188, + "distillation_loss": 6.096728324890137, + "epoch": 4.01, + "learning_rate": 3.328637174791021e-05, + "loss": 107.232, + "step": 4742, + "task_loss": 3.0057220458984375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9638934977152669, + "compression/movement_sparsity/importance_threshold": -0.0002528803672652291, + "compression/movement_sparsity/linear_layer_sparsity": 0.9009324055186193, + "compression/movement_sparsity/model_sparsity": 0.8699826057967927, + "compression_loss": 102.3875961303711, + "distillation_loss": 2.8381001949310303, + "epoch": 4.01, + "learning_rate": 3.32816755893679e-05, + "loss": 105.6779, + "step": 4743, + "task_loss": 1.089234709739685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9639856844850394, + "compression/movement_sparsity/importance_threshold": -0.0002522347155758673, + "compression/movement_sparsity/linear_layer_sparsity": 0.9009738658494891, + "compression/movement_sparsity/model_sparsity": 0.8700226418377504, + "compression_loss": 102.39678955078125, + "distillation_loss": 3.860442876815796, + "epoch": 4.01, + "learning_rate": 3.327697943082559e-05, + "loss": 106.7943, + "step": 4744, + "task_loss": 2.458700656890869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9640777142075421, + "compression/movement_sparsity/importance_threshold": -0.0002515901638039386, + "compression/movement_sparsity/linear_layer_sparsity": 0.9010083028456215, + "compression/movement_sparsity/model_sparsity": 0.870055895817125, + "compression_loss": 102.40593719482422, + "distillation_loss": 3.7795801162719727, + "epoch": 4.01, + "learning_rate": 3.327228327228327e-05, + "loss": 106.849, + "step": 4745, + "task_loss": 2.152385711669922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9641695870166599, + "compression/movement_sparsity/importance_threshold": -0.00025094671101174616, + "compression/movement_sparsity/linear_layer_sparsity": 0.9011130208857997, + "compression/movement_sparsity/model_sparsity": 0.8701570164704726, + "compression_loss": 102.41505432128906, + "distillation_loss": 3.747814655303955, + "epoch": 4.01, + "learning_rate": 3.3267587113740964e-05, + "loss": 107.1793, + "step": 4746, + "task_loss": 2.3008198738098145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9642613030462781, + "compression/movement_sparsity/importance_threshold": -0.0002503043562615947, + "compression/movement_sparsity/linear_layer_sparsity": 0.9011355456384638, + "compression/movement_sparsity/model_sparsity": 0.8701787674285885, + "compression_loss": 102.42414093017578, + "distillation_loss": 4.956059455871582, + "epoch": 4.01, + "learning_rate": 3.326289095519865e-05, + "loss": 107.2513, + "step": 4747, + "task_loss": 2.8391451835632324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9643528624302817, + "compression/movement_sparsity/importance_threshold": -0.0002496630986157881, + "compression/movement_sparsity/linear_layer_sparsity": 0.901165892645097, + "compression/movement_sparsity/model_sparsity": 0.8702080719221856, + "compression_loss": 102.43321990966797, + "distillation_loss": 5.313861846923828, + "epoch": 4.01, + "learning_rate": 3.325819479665634e-05, + "loss": 107.2045, + "step": 4748, + "task_loss": 2.8192031383514404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.964444265302556, + "compression/movement_sparsity/importance_threshold": -0.00024902293713663023, + "compression/movement_sparsity/linear_layer_sparsity": 0.9011905160512651, + "compression/movement_sparsity/model_sparsity": 0.8702318494386013, + "compression_loss": 102.44236755371094, + "distillation_loss": 5.839061260223389, + "epoch": 4.01, + "learning_rate": 3.325349863811402e-05, + "loss": 106.8445, + "step": 4749, + "task_loss": 2.828413724899292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.964535511796986, + "compression/movement_sparsity/importance_threshold": -0.00024838387088642506, + "compression/movement_sparsity/linear_layer_sparsity": 0.901203847270682, + "compression/movement_sparsity/model_sparsity": 0.8702447226896196, + "compression_loss": 102.45146179199219, + "distillation_loss": 5.051539421081543, + "epoch": 4.02, + "learning_rate": 3.324880247957171e-05, + "loss": 107.196, + "step": 4750, + "task_loss": 2.9877536296844482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.964626602047457, + "compression/movement_sparsity/importance_threshold": -0.00024774589892747646, + "compression/movement_sparsity/linear_layer_sparsity": 0.9011523110181598, + "compression/movement_sparsity/model_sparsity": 0.8701949568659156, + "compression_loss": 102.4605484008789, + "distillation_loss": 5.509265422821045, + "epoch": 4.02, + "learning_rate": 3.32441063210294e-05, + "loss": 106.9123, + "step": 4751, + "task_loss": 2.274599075317383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9647175361878539, + "compression/movement_sparsity/importance_threshold": -0.0002471090203220883, + "compression/movement_sparsity/linear_layer_sparsity": 0.9011253266267999, + "compression/movement_sparsity/model_sparsity": 0.8701688994714125, + "compression_loss": 102.46958923339844, + "distillation_loss": 4.1670098304748535, + "epoch": 4.02, + "learning_rate": 3.323941016248709e-05, + "loss": 106.8653, + "step": 4752, + "task_loss": 2.10685396194458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.964808314352062, + "compression/movement_sparsity/importance_threshold": -0.0002464732341325654, + "compression/movement_sparsity/linear_layer_sparsity": 0.9011417700539698, + "compression/movement_sparsity/model_sparsity": 0.8701847780162733, + "compression_loss": 102.47862243652344, + "distillation_loss": 3.422401189804077, + "epoch": 4.02, + "learning_rate": 3.3234714003944775e-05, + "loss": 106.9227, + "step": 4753, + "task_loss": 1.9792814254760742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9648989366739665, + "compression/movement_sparsity/importance_threshold": -0.0002458385394212108, + "compression/movement_sparsity/linear_layer_sparsity": 0.9011828845839782, + "compression/movement_sparsity/model_sparsity": 0.8702244801356929, + "compression_loss": 102.4876480102539, + "distillation_loss": 3.653017044067383, + "epoch": 4.02, + "learning_rate": 3.323001784540246e-05, + "loss": 106.6764, + "step": 4754, + "task_loss": 1.923354148864746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9649894032874524, + "compression/movement_sparsity/importance_threshold": -0.0002452049352503292, + "compression/movement_sparsity/linear_layer_sparsity": 0.9012377238309355, + "compression/movement_sparsity/model_sparsity": 0.8702774354858119, + "compression_loss": 102.49662017822266, + "distillation_loss": 6.276140213012695, + "epoch": 4.02, + "learning_rate": 3.322532168686015e-05, + "loss": 106.7666, + "step": 4755, + "task_loss": 3.2657864093780518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.965079714326405, + "compression/movement_sparsity/importance_threshold": -0.0002445724206822236, + "compression/movement_sparsity/linear_layer_sparsity": 0.9012958899206632, + "compression/movement_sparsity/model_sparsity": 0.8703336033914176, + "compression_loss": 102.50564575195312, + "distillation_loss": 4.390575885772705, + "epoch": 4.02, + "learning_rate": 3.322062552831784e-05, + "loss": 106.6049, + "step": 4756, + "task_loss": 2.2503719329833984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9651698699247092, + "compression/movement_sparsity/importance_threshold": -0.00024394099477919977, + "compression/movement_sparsity/linear_layer_sparsity": 0.9014229896234939, + "compression/movement_sparsity/model_sparsity": 0.8704563368284517, + "compression_loss": 102.5146484375, + "distillation_loss": 4.468634605407715, + "epoch": 4.02, + "learning_rate": 3.321592936977552e-05, + "loss": 107.1529, + "step": 4757, + "task_loss": 2.3377504348754883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9652598702162504, + "compression/movement_sparsity/importance_threshold": -0.0002433106566035589, + "compression/movement_sparsity/linear_layer_sparsity": 0.9014323977917587, + "compression/movement_sparsity/model_sparsity": 0.8704654217971936, + "compression_loss": 102.52351379394531, + "distillation_loss": 4.082657814025879, + "epoch": 4.02, + "learning_rate": 3.3211233211233214e-05, + "loss": 107.2328, + "step": 4758, + "task_loss": 1.8868112564086914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9653497153349135, + "compression/movement_sparsity/importance_threshold": -0.00024268140521760753, + "compression/movement_sparsity/linear_layer_sparsity": 0.901421153301678, + "compression/movement_sparsity/model_sparsity": 0.8704545635899393, + "compression_loss": 102.53240966796875, + "distillation_loss": 2.7098288536071777, + "epoch": 4.02, + "learning_rate": 3.32065370526909e-05, + "loss": 106.782, + "step": 4759, + "task_loss": 1.1418133974075317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9654394054145838, + "compression/movement_sparsity/importance_threshold": -0.00024205323968364868, + "compression/movement_sparsity/linear_layer_sparsity": 0.9014085613806545, + "compression/movement_sparsity/model_sparsity": 0.8704424042401403, + "compression_loss": 102.54130554199219, + "distillation_loss": 3.3141701221466064, + "epoch": 4.02, + "learning_rate": 3.3201840894148586e-05, + "loss": 106.5259, + "step": 4760, + "task_loss": 1.3519169092178345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9655289405891465, + "compression/movement_sparsity/importance_threshold": -0.00024142615906398626, + "compression/movement_sparsity/linear_layer_sparsity": 0.9014684683988571, + "compression/movement_sparsity/model_sparsity": 0.8705002532679721, + "compression_loss": 102.55020141601562, + "distillation_loss": 4.16291618347168, + "epoch": 4.02, + "learning_rate": 3.319714473560628e-05, + "loss": 107.1851, + "step": 4761, + "task_loss": 2.811535120010376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9656183209924866, + "compression/movement_sparsity/importance_threshold": -0.00024080016242092415, + "compression/movement_sparsity/linear_layer_sparsity": 0.901521423627328, + "compression/movement_sparsity/model_sparsity": 0.8705513893214356, + "compression_loss": 102.55904388427734, + "distillation_loss": 6.311136722564697, + "epoch": 4.03, + "learning_rate": 3.319244857706396e-05, + "loss": 108.194, + "step": 4762, + "task_loss": 3.2755727767944336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9657075467584892, + "compression/movement_sparsity/importance_threshold": -0.00024017524881676713, + "compression/movement_sparsity/linear_layer_sparsity": 0.901529508212985, + "compression/movement_sparsity/model_sparsity": 0.8705591961767043, + "compression_loss": 102.56788635253906, + "distillation_loss": 4.883666038513184, + "epoch": 4.03, + "learning_rate": 3.318775241852165e-05, + "loss": 107.0493, + "step": 4763, + "task_loss": 2.172210454940796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9657966180210396, + "compression/movement_sparsity/importance_threshold": -0.00023955141731381822, + "compression/movement_sparsity/linear_layer_sparsity": 0.9015997415603604, + "compression/movement_sparsity/model_sparsity": 0.8706270167925342, + "compression_loss": 102.57667541503906, + "distillation_loss": 6.006497383117676, + "epoch": 4.03, + "learning_rate": 3.318305625997934e-05, + "loss": 108.395, + "step": 4764, + "task_loss": 2.446901321411133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9658855349140228, + "compression/movement_sparsity/importance_threshold": -0.00023892866697438134, + "compression/movement_sparsity/linear_layer_sparsity": 0.9016180570818491, + "compression/movement_sparsity/model_sparsity": 0.8706447031195146, + "compression_loss": 102.58549499511719, + "distillation_loss": 5.098332405090332, + "epoch": 4.03, + "learning_rate": 3.317836010143703e-05, + "loss": 107.4164, + "step": 4765, + "task_loss": 3.2266478538513184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9659742975713241, + "compression/movement_sparsity/importance_threshold": -0.00023830699686076037, + "compression/movement_sparsity/linear_layer_sparsity": 0.9016944909963949, + "compression/movement_sparsity/model_sparsity": 0.8707185112939576, + "compression_loss": 102.59429931640625, + "distillation_loss": 6.077312469482422, + "epoch": 4.03, + "learning_rate": 3.317366394289471e-05, + "loss": 107.6598, + "step": 4766, + "task_loss": 2.8815295696258545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9660629061268285, + "compression/movement_sparsity/importance_threshold": -0.00023768640603526094, + "compression/movement_sparsity/linear_layer_sparsity": 0.9017788544824187, + "compression/movement_sparsity/model_sparsity": 0.870799976634704, + "compression_loss": 102.60308837890625, + "distillation_loss": 5.161858558654785, + "epoch": 4.03, + "learning_rate": 3.31689677843524e-05, + "loss": 107.0557, + "step": 4767, + "task_loss": 2.3756675720214844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9661513607144212, + "compression/movement_sparsity/importance_threshold": -0.00023706689356018523, + "compression/movement_sparsity/linear_layer_sparsity": 0.901816010188772, + "compression/movement_sparsity/model_sparsity": 0.8708358559282398, + "compression_loss": 102.61189270019531, + "distillation_loss": 5.02192497253418, + "epoch": 4.03, + "learning_rate": 3.316427162581009e-05, + "loss": 106.2566, + "step": 4768, + "task_loss": 2.2402851581573486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9662396614679873, + "compression/movement_sparsity/importance_threshold": -0.000236448458497838, + "compression/movement_sparsity/linear_layer_sparsity": 0.9017842442061901, + "compression/movement_sparsity/model_sparsity": 0.8708051812048831, + "compression_loss": 102.62062072753906, + "distillation_loss": 4.487470626831055, + "epoch": 4.03, + "learning_rate": 3.315957546726778e-05, + "loss": 106.6382, + "step": 4769, + "task_loss": 1.8975750207901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9663278085214121, + "compression/movement_sparsity/importance_threshold": -0.00023583109991052315, + "compression/movement_sparsity/linear_layer_sparsity": 0.901828351702275, + "compression/movement_sparsity/model_sparsity": 0.8708477734727871, + "compression_loss": 102.62935638427734, + "distillation_loss": 5.582508563995361, + "epoch": 4.03, + "learning_rate": 3.315487930872546e-05, + "loss": 108.1974, + "step": 4770, + "task_loss": 3.0257253646850586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9664158020085805, + "compression/movement_sparsity/importance_threshold": -0.00023521481686054457, + "compression/movement_sparsity/linear_layer_sparsity": 0.9019323304440598, + "compression/movement_sparsity/model_sparsity": 0.8709481802249155, + "compression_loss": 102.6380615234375, + "distillation_loss": 4.577693462371826, + "epoch": 4.03, + "learning_rate": 3.315018315018315e-05, + "loss": 106.8369, + "step": 4771, + "task_loss": 1.8708293437957764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9665036420633777, + "compression/movement_sparsity/importance_threshold": -0.00023459960841020616, + "compression/movement_sparsity/linear_layer_sparsity": 0.90196788831195, + "compression/movement_sparsity/model_sparsity": 0.8709825165706548, + "compression_loss": 102.64679718017578, + "distillation_loss": 4.865073204040527, + "epoch": 4.03, + "learning_rate": 3.314548699164084e-05, + "loss": 106.9721, + "step": 4772, + "task_loss": 1.9343392848968506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9665913288196889, + "compression/movement_sparsity/importance_threshold": -0.0002339854736218127, + "compression/movement_sparsity/linear_layer_sparsity": 0.9020291427610954, + "compression/movement_sparsity/model_sparsity": 0.8710416667410313, + "compression_loss": 102.655517578125, + "distillation_loss": 4.84503173828125, + "epoch": 4.03, + "learning_rate": 3.314079083309853e-05, + "loss": 107.1568, + "step": 4773, + "task_loss": 2.7602505683898926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9666788624113993, + "compression/movement_sparsity/importance_threshold": -0.00023337241155766634, + "compression/movement_sparsity/linear_layer_sparsity": 0.9021789222307695, + "compression/movement_sparsity/model_sparsity": 0.8711863008251466, + "compression_loss": 102.66424560546875, + "distillation_loss": 6.017032146453857, + "epoch": 4.04, + "learning_rate": 3.3136094674556215e-05, + "loss": 107.2348, + "step": 4774, + "task_loss": 2.9733078479766846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.966766242972394, + "compression/movement_sparsity/importance_threshold": -0.00023276042128007272, + "compression/movement_sparsity/linear_layer_sparsity": 0.9022319370800785, + "compression/movement_sparsity/model_sparsity": 0.8712374944512892, + "compression_loss": 102.67283630371094, + "distillation_loss": 3.5754313468933105, + "epoch": 4.04, + "learning_rate": 3.31313985160139e-05, + "loss": 106.1666, + "step": 4775, + "task_loss": 1.8658751249313354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.966853470636558, + "compression/movement_sparsity/importance_threshold": -0.00023214950185133573, + "compression/movement_sparsity/linear_layer_sparsity": 0.9022712987574445, + "compression/movement_sparsity/model_sparsity": 0.8712755039339469, + "compression_loss": 102.68152618408203, + "distillation_loss": 4.286721229553223, + "epoch": 4.04, + "learning_rate": 3.312670235747159e-05, + "loss": 106.8369, + "step": 4776, + "task_loss": 2.066082000732422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9669405455377766, + "compression/movement_sparsity/importance_threshold": -0.00023153965233375842, + "compression/movement_sparsity/linear_layer_sparsity": 0.902301538446569, + "compression/movement_sparsity/model_sparsity": 0.8713047047967218, + "compression_loss": 102.69014739990234, + "distillation_loss": 5.230113983154297, + "epoch": 4.04, + "learning_rate": 3.312200619892928e-05, + "loss": 106.7509, + "step": 4777, + "task_loss": 3.0286102294921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9670274678099349, + "compression/movement_sparsity/importance_threshold": -0.00023093087178964553, + "compression/movement_sparsity/linear_layer_sparsity": 0.9023404947022354, + "compression/movement_sparsity/model_sparsity": 0.8713423227851627, + "compression_loss": 102.69873809814453, + "distillation_loss": 4.976161956787109, + "epoch": 4.04, + "learning_rate": 3.311731004038697e-05, + "loss": 106.9686, + "step": 4778, + "task_loss": 2.811161994934082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9671142375869181, + "compression/movement_sparsity/importance_threshold": -0.00023032315928130011, + "compression/movement_sparsity/linear_layer_sparsity": 0.9023590129345739, + "compression/movement_sparsity/model_sparsity": 0.8713602048592516, + "compression_loss": 102.70729064941406, + "distillation_loss": 5.952496528625488, + "epoch": 4.04, + "learning_rate": 3.3112613881844654e-05, + "loss": 107.5295, + "step": 4779, + "task_loss": 2.136298656463623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9672008550026112, + "compression/movement_sparsity/importance_threshold": -0.00022971651387102693, + "compression/movement_sparsity/linear_layer_sparsity": 0.9023332209599775, + "compression/movement_sparsity/model_sparsity": 0.871335298918328, + "compression_loss": 102.71589660644531, + "distillation_loss": 2.958289384841919, + "epoch": 4.04, + "learning_rate": 3.310791772330234e-05, + "loss": 106.7317, + "step": 4780, + "task_loss": 1.438698649406433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9672873201908995, + "compression/movement_sparsity/importance_threshold": -0.00022911093462113074, + "compression/movement_sparsity/linear_layer_sparsity": 0.902385925780928, + "compression/movement_sparsity/model_sparsity": 0.8713861931665399, + "compression_loss": 102.72441864013672, + "distillation_loss": 4.869726657867432, + "epoch": 4.04, + "learning_rate": 3.3103221564760027e-05, + "loss": 107.1402, + "step": 4781, + "task_loss": 3.061699390411377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9673736332856679, + "compression/movement_sparsity/importance_threshold": -0.0002285064205939137, + "compression/movement_sparsity/linear_layer_sparsity": 0.902467499011725, + "compression/movement_sparsity/model_sparsity": 0.8714649641059102, + "compression_loss": 102.7329330444336, + "distillation_loss": 5.688161849975586, + "epoch": 4.04, + "learning_rate": 3.309852540621772e-05, + "loss": 107.5958, + "step": 4782, + "task_loss": 2.566164493560791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9674597944208019, + "compression/movement_sparsity/importance_threshold": -0.0002279029708516806, + "compression/movement_sparsity/linear_layer_sparsity": 0.9025167577482287, + "compression/movement_sparsity/model_sparsity": 0.8715125306532775, + "compression_loss": 102.74149322509766, + "distillation_loss": 4.158825874328613, + "epoch": 4.04, + "learning_rate": 3.30938292476754e-05, + "loss": 106.5877, + "step": 4783, + "task_loss": 2.038264274597168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9675458037301863, + "compression/movement_sparsity/importance_threshold": -0.0002273005844567362, + "compression/movement_sparsity/linear_layer_sparsity": 0.9025653845038478, + "compression/movement_sparsity/model_sparsity": 0.8715594869302477, + "compression_loss": 102.74993896484375, + "distillation_loss": 3.600076198577881, + "epoch": 4.04, + "learning_rate": 3.308913308913309e-05, + "loss": 107.1283, + "step": 4784, + "task_loss": 2.175241470336914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9676316613477064, + "compression/movement_sparsity/importance_threshold": -0.00022669926047138352, + "compression/movement_sparsity/linear_layer_sparsity": 0.9026348427803267, + "compression/movement_sparsity/model_sparsity": 0.8716265591012509, + "compression_loss": 102.75843811035156, + "distillation_loss": 5.354538917541504, + "epoch": 4.04, + "learning_rate": 3.308443693059078e-05, + "loss": 107.4868, + "step": 4785, + "task_loss": 3.5572593212127686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9677173674072472, + "compression/movement_sparsity/importance_threshold": -0.00022609899795792733, + "compression/movement_sparsity/linear_layer_sparsity": 0.9026958587461194, + "compression/movement_sparsity/model_sparsity": 0.8716854789809115, + "compression_loss": 102.76697540283203, + "distillation_loss": 4.784909725189209, + "epoch": 4.05, + "learning_rate": 3.3079740772048465e-05, + "loss": 107.4464, + "step": 4786, + "task_loss": 3.323976516723633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9678029220426941, + "compression/movement_sparsity/importance_threshold": -0.00022549979597867066, + "compression/movement_sparsity/linear_layer_sparsity": 0.9026935096850951, + "compression/movement_sparsity/model_sparsity": 0.87168321061736, + "compression_loss": 102.7754898071289, + "distillation_loss": 5.045825958251953, + "epoch": 4.05, + "learning_rate": 3.307504461350615e-05, + "loss": 107.4556, + "step": 4787, + "task_loss": 2.6036953926086426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9678883253879321, + "compression/movement_sparsity/importance_threshold": -0.0002249016535959174, + "compression/movement_sparsity/linear_layer_sparsity": 0.9026732743726171, + "compression/movement_sparsity/model_sparsity": 0.8716636704501167, + "compression_loss": 102.78396606445312, + "distillation_loss": 4.173159122467041, + "epoch": 4.05, + "learning_rate": 3.307034845496384e-05, + "loss": 106.6053, + "step": 4788, + "task_loss": 2.4369330406188965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9679735775768464, + "compression/movement_sparsity/importance_threshold": -0.00022430456987197235, + "compression/movement_sparsity/linear_layer_sparsity": 0.9027439608383625, + "compression/movement_sparsity/model_sparsity": 0.8717319286183067, + "compression_loss": 102.79241180419922, + "distillation_loss": 5.7845306396484375, + "epoch": 4.05, + "learning_rate": 3.306565229642153e-05, + "loss": 107.5709, + "step": 4789, + "task_loss": 3.185696601867676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.968058678743322, + "compression/movement_sparsity/importance_threshold": -0.00022370854386913937, + "compression/movement_sparsity/linear_layer_sparsity": 0.9028109150396378, + "compression/movement_sparsity/model_sparsity": 0.8717965827367931, + "compression_loss": 102.80086517333984, + "distillation_loss": 4.4043989181518555, + "epoch": 4.05, + "learning_rate": 3.306095613787922e-05, + "loss": 107.7359, + "step": 4790, + "task_loss": 2.699735164642334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9681436290212441, + "compression/movement_sparsity/importance_threshold": -0.00022311357464972238, + "compression/movement_sparsity/linear_layer_sparsity": 0.9028552371707403, + "compression/movement_sparsity/model_sparsity": 0.8718393822663414, + "compression_loss": 102.80928039550781, + "distillation_loss": 4.339084625244141, + "epoch": 4.05, + "learning_rate": 3.3056259979336904e-05, + "loss": 106.6341, + "step": 4791, + "task_loss": 2.271944761276245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.968228428544498, + "compression/movement_sparsity/importance_threshold": -0.0002225196612760244, + "compression/movement_sparsity/linear_layer_sparsity": 0.902963150887845, + "compression/movement_sparsity/model_sparsity": 0.871943588815282, + "compression_loss": 102.81771850585938, + "distillation_loss": 4.447133541107178, + "epoch": 4.05, + "learning_rate": 3.305156382079459e-05, + "loss": 107.3625, + "step": 4792, + "task_loss": 2.0206549167633057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9683130774469686, + "compression/movement_sparsity/importance_threshold": -0.00022192680281035108, + "compression/movement_sparsity/linear_layer_sparsity": 0.9030550742961497, + "compression/movement_sparsity/model_sparsity": 0.8720323543717221, + "compression_loss": 102.82608795166016, + "distillation_loss": 3.582505941390991, + "epoch": 4.05, + "learning_rate": 3.3046867662252276e-05, + "loss": 107.0794, + "step": 4793, + "task_loss": 2.677927255630493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.968397575862541, + "compression/movement_sparsity/importance_threshold": -0.00022133499831500543, + "compression/movement_sparsity/linear_layer_sparsity": 0.9030402167832755, + "compression/movement_sparsity/model_sparsity": 0.8720180072601221, + "compression_loss": 102.83450317382812, + "distillation_loss": 4.180645942687988, + "epoch": 4.05, + "learning_rate": 3.304217150370997e-05, + "loss": 107.4208, + "step": 4794, + "task_loss": 2.0192930698394775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9684819239251007, + "compression/movement_sparsity/importance_threshold": -0.00022074424685229223, + "compression/movement_sparsity/linear_layer_sparsity": 0.9031129661300219, + "compression/movement_sparsity/model_sparsity": 0.8720882574430046, + "compression_loss": 102.84291076660156, + "distillation_loss": 4.118429183959961, + "epoch": 4.05, + "learning_rate": 3.3037475345167656e-05, + "loss": 106.823, + "step": 4795, + "task_loss": 1.6211737394332886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9685661217685325, + "compression/movement_sparsity/importance_threshold": -0.00022015454748451364, + "compression/movement_sparsity/linear_layer_sparsity": 0.9030674038854852, + "compression/movement_sparsity/model_sparsity": 0.8720442604017337, + "compression_loss": 102.85122680664062, + "distillation_loss": 4.612369537353516, + "epoch": 4.05, + "learning_rate": 3.303277918662534e-05, + "loss": 106.4381, + "step": 4796, + "task_loss": 3.711895227432251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9686501695267218, + "compression/movement_sparsity/importance_threshold": -0.0002195658992739753, + "compression/movement_sparsity/linear_layer_sparsity": 0.9031891019403768, + "compression/movement_sparsity/model_sparsity": 0.8721617777540527, + "compression_loss": 102.85961151123047, + "distillation_loss": 4.436980724334717, + "epoch": 4.05, + "learning_rate": 3.302808302808303e-05, + "loss": 107.3193, + "step": 4797, + "task_loss": 2.0329346656799316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9687340673335535, + "compression/movement_sparsity/importance_threshold": -0.00021897830128298022, + "compression/movement_sparsity/linear_layer_sparsity": 0.9032424148938768, + "compression/movement_sparsity/model_sparsity": 0.8722132592435901, + "compression_loss": 102.86788177490234, + "distillation_loss": 4.790655136108398, + "epoch": 4.06, + "learning_rate": 3.3023386869540715e-05, + "loss": 106.6703, + "step": 4798, + "task_loss": 1.9334609508514404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9688178153229129, + "compression/movement_sparsity/importance_threshold": -0.00021839175257383318, + "compression/movement_sparsity/linear_layer_sparsity": 0.9032975045483546, + "compression/movement_sparsity/model_sparsity": 0.8722664563989608, + "compression_loss": 102.87620544433594, + "distillation_loss": 3.4879918098449707, + "epoch": 4.06, + "learning_rate": 3.301869071099841e-05, + "loss": 107.1401, + "step": 4799, + "task_loss": 2.313281536102295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.968901413628685, + "compression/movement_sparsity/importance_threshold": -0.00021780625220883896, + "compression/movement_sparsity/linear_layer_sparsity": 0.9032866654799735, + "compression/movement_sparsity/model_sparsity": 0.8722559896859237, + "compression_loss": 102.88446044921875, + "distillation_loss": 4.432814121246338, + "epoch": 4.06, + "learning_rate": 3.301399455245609e-05, + "loss": 106.9459, + "step": 4800, + "task_loss": 2.8138625621795654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9689848623847551, + "compression/movement_sparsity/importance_threshold": -0.00021722179925029884, + "compression/movement_sparsity/linear_layer_sparsity": 0.9033663308439487, + "compression/movement_sparsity/model_sparsity": 0.8723329182995669, + "compression_loss": 102.89271545410156, + "distillation_loss": 5.30959939956665, + "epoch": 4.06, + "learning_rate": 3.300929839391378e-05, + "loss": 107.7164, + "step": 4801, + "task_loss": 3.613698720932007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9690681617250083, + "compression/movement_sparsity/importance_threshold": -0.00021663839276051845, + "compression/movement_sparsity/linear_layer_sparsity": 0.9033493269809, + "compression/movement_sparsity/model_sparsity": 0.8723164985715239, + "compression_loss": 102.90096282958984, + "distillation_loss": 3.8767526149749756, + "epoch": 4.06, + "learning_rate": 3.300460223537147e-05, + "loss": 107.8536, + "step": 4802, + "task_loss": 1.7695902585983276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9691513117833296, + "compression/movement_sparsity/importance_threshold": -0.00021605603180180257, + "compression/movement_sparsity/linear_layer_sparsity": 0.9033686679808053, + "compression/movement_sparsity/model_sparsity": 0.8723351751485827, + "compression_loss": 102.90921783447266, + "distillation_loss": 3.9128403663635254, + "epoch": 4.06, + "learning_rate": 3.299990607682916e-05, + "loss": 107.1655, + "step": 4803, + "task_loss": 2.1121826171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9692343126936044, + "compression/movement_sparsity/importance_threshold": -0.00021547471543645336, + "compression/movement_sparsity/linear_layer_sparsity": 0.9034260351513015, + "compression/movement_sparsity/model_sparsity": 0.8723905715802902, + "compression_loss": 102.91742706298828, + "distillation_loss": 5.042122840881348, + "epoch": 4.06, + "learning_rate": 3.2995209918286846e-05, + "loss": 107.3597, + "step": 4804, + "task_loss": 2.3340232372283936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9693171645897176, + "compression/movement_sparsity/importance_threshold": -0.00021489444272677558, + "compression/movement_sparsity/linear_layer_sparsity": 0.9034353598503927, + "compression/movement_sparsity/model_sparsity": 0.8723995759472816, + "compression_loss": 102.92564392089844, + "distillation_loss": 5.148585796356201, + "epoch": 4.06, + "learning_rate": 3.2990513759744526e-05, + "loss": 107.3898, + "step": 4805, + "task_loss": 2.1772398948669434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9693998676055544, + "compression/movement_sparsity/importance_threshold": -0.00021431521273507486, + "compression/movement_sparsity/linear_layer_sparsity": 0.9034389728731864, + "compression/movement_sparsity/model_sparsity": 0.8724030648516273, + "compression_loss": 102.93388366699219, + "distillation_loss": 3.664132595062256, + "epoch": 4.06, + "learning_rate": 3.298581760120222e-05, + "loss": 107.2649, + "step": 4806, + "task_loss": 2.480259656906128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.969482421875, + "compression/movement_sparsity/importance_threshold": -0.0002137370245236525, + "compression/movement_sparsity/linear_layer_sparsity": 0.90349721050792, + "compression/movement_sparsity/model_sparsity": 0.8724593018444479, + "compression_loss": 102.94212341308594, + "distillation_loss": 4.195376873016357, + "epoch": 4.06, + "learning_rate": 3.2981121442659905e-05, + "loss": 106.6227, + "step": 4807, + "task_loss": 1.8583990335464478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9695648275319395, + "compression/movement_sparsity/importance_threshold": -0.00021315987715481415, + "compression/movement_sparsity/linear_layer_sparsity": 0.9034363376321388, + "compression/movement_sparsity/model_sparsity": 0.8724005201392168, + "compression_loss": 102.9503173828125, + "distillation_loss": 4.824770927429199, + "epoch": 4.06, + "learning_rate": 3.29764252841176e-05, + "loss": 107.5201, + "step": 4808, + "task_loss": 2.253376007080078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9696470847102581, + "compression/movement_sparsity/importance_threshold": -0.00021258376969086282, + "compression/movement_sparsity/linear_layer_sparsity": 0.9034531030118349, + "compression/movement_sparsity/model_sparsity": 0.8724167095765438, + "compression_loss": 102.95848083496094, + "distillation_loss": 4.4920525550842285, + "epoch": 4.07, + "learning_rate": 3.297172912557528e-05, + "loss": 107.4236, + "step": 4809, + "task_loss": 3.5478129386901855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9697291935438409, + "compression/movement_sparsity/importance_threshold": -0.0002120087011941033, + "compression/movement_sparsity/linear_layer_sparsity": 0.9035288930213284, + "compression/movement_sparsity/model_sparsity": 0.872489895966054, + "compression_loss": 102.96662902832031, + "distillation_loss": 4.278686046600342, + "epoch": 4.07, + "learning_rate": 3.296703296703297e-05, + "loss": 107.5704, + "step": 4810, + "task_loss": 2.250063419342041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.969811154166573, + "compression/movement_sparsity/importance_threshold": -0.00021143467072683858, + "compression/movement_sparsity/linear_layer_sparsity": 0.9036065312768056, + "compression/movement_sparsity/model_sparsity": 0.8725648671086123, + "compression_loss": 102.97474670410156, + "distillation_loss": 4.549725532531738, + "epoch": 4.07, + "learning_rate": 3.296233680849066e-05, + "loss": 106.9804, + "step": 4811, + "task_loss": 2.4518086910247803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9698929667123396, + "compression/movement_sparsity/importance_threshold": -0.00021086167735137346, + "compression/movement_sparsity/linear_layer_sparsity": 0.9036753217998968, + "compression/movement_sparsity/model_sparsity": 0.8726312944656109, + "compression_loss": 102.98284912109375, + "distillation_loss": 4.317776679992676, + "epoch": 4.07, + "learning_rate": 3.2957640649948344e-05, + "loss": 107.0914, + "step": 4812, + "task_loss": 2.2881648540496826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9699746313150257, + "compression/movement_sparsity/importance_threshold": -0.0002102897201300127, + "compression/movement_sparsity/linear_layer_sparsity": 0.9036953663256927, + "compression/movement_sparsity/model_sparsity": 0.8726506504002816, + "compression_loss": 102.9909439086914, + "distillation_loss": 6.259421348571777, + "epoch": 4.07, + "learning_rate": 3.295294449140603e-05, + "loss": 107.7264, + "step": 4813, + "task_loss": 3.3878540992736816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9700561481085167, + "compression/movement_sparsity/importance_threshold": -0.00020971879812505846, + "compression/movement_sparsity/linear_layer_sparsity": 0.9037732907611931, + "compression/movement_sparsity/model_sparsity": 0.8727258978916989, + "compression_loss": 102.99905395507812, + "distillation_loss": 3.979527473449707, + "epoch": 4.07, + "learning_rate": 3.2948248332863716e-05, + "loss": 107.2432, + "step": 4814, + "task_loss": 1.8918050527572632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9701375172266976, + "compression/movement_sparsity/importance_threshold": -0.0002091489103988155, + "compression/movement_sparsity/linear_layer_sparsity": 0.903816945138908, + "compression/movement_sparsity/model_sparsity": 0.8727680526072427, + "compression_loss": 103.00713348388672, + "distillation_loss": 4.389825344085693, + "epoch": 4.07, + "learning_rate": 3.294355217432141e-05, + "loss": 106.7897, + "step": 4815, + "task_loss": 1.8579771518707275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9702187388034534, + "compression/movement_sparsity/importance_threshold": -0.0002085800560135886, + "compression/movement_sparsity/linear_layer_sparsity": 0.9038467555579976, + "compression/movement_sparsity/model_sparsity": 0.872796838946729, + "compression_loss": 103.01522064208984, + "distillation_loss": 4.505827903747559, + "epoch": 4.07, + "learning_rate": 3.2938856015779096e-05, + "loss": 107.0756, + "step": 4816, + "task_loss": 1.6593960523605347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9702998129726695, + "compression/movement_sparsity/importance_threshold": -0.0002080122340316808, + "compression/movement_sparsity/linear_layer_sparsity": 0.9039200057197847, + "compression/movement_sparsity/model_sparsity": 0.8728675727401148, + "compression_loss": 103.02324676513672, + "distillation_loss": 4.635760307312012, + "epoch": 4.07, + "learning_rate": 3.293415985723678e-05, + "loss": 107.7039, + "step": 4817, + "task_loss": 2.437514305114746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9703807398682309, + "compression/movement_sparsity/importance_threshold": -0.00020744544351539596, + "compression/movement_sparsity/linear_layer_sparsity": 0.9039227244300057, + "compression/movement_sparsity/model_sparsity": 0.872870198054276, + "compression_loss": 103.03123474121094, + "distillation_loss": 3.671464681625366, + "epoch": 4.07, + "learning_rate": 3.292946369869447e-05, + "loss": 107.456, + "step": 4818, + "task_loss": 1.8804922103881836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9704615196240227, + "compression/movement_sparsity/importance_threshold": -0.00020687968352703888, + "compression/movement_sparsity/linear_layer_sparsity": 0.903940038321413, + "compression/movement_sparsity/model_sparsity": 0.8728869171602497, + "compression_loss": 103.03926086425781, + "distillation_loss": 2.5917115211486816, + "epoch": 4.07, + "learning_rate": 3.2924767540152155e-05, + "loss": 106.2955, + "step": 4819, + "task_loss": 2.0098533630371094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9705421523739303, + "compression/movement_sparsity/importance_threshold": -0.0002063149531289117, + "compression/movement_sparsity/linear_layer_sparsity": 0.9039983832736552, + "compression/movement_sparsity/model_sparsity": 0.8729432577838924, + "compression_loss": 103.04725646972656, + "distillation_loss": 3.6401312351226807, + "epoch": 4.07, + "learning_rate": 3.292007138160985e-05, + "loss": 106.5979, + "step": 4820, + "task_loss": 2.5357186794281006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9706226382518384, + "compression/movement_sparsity/importance_threshold": -0.00020575125138332182, + "compression/movement_sparsity/linear_layer_sparsity": 0.9040809223620307, + "compression/movement_sparsity/model_sparsity": 0.8730229614006623, + "compression_loss": 103.05523681640625, + "distillation_loss": 4.618346214294434, + "epoch": 4.08, + "learning_rate": 3.2915375223067534e-05, + "loss": 108.0826, + "step": 4821, + "task_loss": 2.9333016872406006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9707029773916325, + "compression/movement_sparsity/importance_threshold": -0.00020518857735256964, + "compression/movement_sparsity/linear_layer_sparsity": 0.9042031093077953, + "compression/movement_sparsity/model_sparsity": 0.8731409508489488, + "compression_loss": 103.06315612792969, + "distillation_loss": 3.853951930999756, + "epoch": 4.08, + "learning_rate": 3.291067906452522e-05, + "loss": 107.264, + "step": 4822, + "task_loss": 2.2856199741363525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9707831699271976, + "compression/movement_sparsity/importance_threshold": -0.0002046269300989608, + "compression/movement_sparsity/linear_layer_sparsity": 0.9042424590609938, + "compression/movement_sparsity/model_sparsity": 0.8731789488170708, + "compression_loss": 103.07108306884766, + "distillation_loss": 3.9655282497406006, + "epoch": 4.08, + "learning_rate": 3.290598290598291e-05, + "loss": 107.113, + "step": 4823, + "task_loss": 2.3247575759887695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9708632159924188, + "compression/movement_sparsity/importance_threshold": -0.0002040663086848001, + "compression/movement_sparsity/linear_layer_sparsity": 0.9042813914683249, + "compression/movement_sparsity/model_sparsity": 0.8732165437764401, + "compression_loss": 103.07894897460938, + "distillation_loss": 5.123559951782227, + "epoch": 4.08, + "learning_rate": 3.290128674744059e-05, + "loss": 107.5273, + "step": 4824, + "task_loss": 2.219270706176758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9709431157211814, + "compression/movement_sparsity/importance_threshold": -0.00020350671217238964, + "compression/movement_sparsity/linear_layer_sparsity": 0.9042884744239005, + "compression/movement_sparsity/model_sparsity": 0.873223383410702, + "compression_loss": 103.08683776855469, + "distillation_loss": 5.56449031829834, + "epoch": 4.08, + "learning_rate": 3.2896590588898286e-05, + "loss": 107.6456, + "step": 4825, + "task_loss": 3.9072721004486084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9710228692473704, + "compression/movement_sparsity/importance_threshold": -0.0002029481396240351, + "compression/movement_sparsity/linear_layer_sparsity": 0.9043625712015898, + "compression/movement_sparsity/model_sparsity": 0.8732949347361293, + "compression_loss": 103.0947265625, + "distillation_loss": 6.15261173248291, + "epoch": 4.08, + "learning_rate": 3.2891894430355966e-05, + "loss": 107.5957, + "step": 4826, + "task_loss": 2.930346965789795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.971102476704871, + "compression/movement_sparsity/importance_threshold": -0.00020239059010203776, + "compression/movement_sparsity/linear_layer_sparsity": 0.9044721185296605, + "compression/movement_sparsity/model_sparsity": 0.8734007187764737, + "compression_loss": 103.10258483886719, + "distillation_loss": 4.318051815032959, + "epoch": 4.08, + "learning_rate": 3.288719827181366e-05, + "loss": 107.6706, + "step": 4827, + "task_loss": 1.484450340270996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9711819382275683, + "compression/movement_sparsity/importance_threshold": -0.00020183406266870586, + "compression/movement_sparsity/linear_layer_sparsity": 0.9045737601345886, + "compression/movement_sparsity/model_sparsity": 0.8734988686795864, + "compression_loss": 103.11041259765625, + "distillation_loss": 3.511653423309326, + "epoch": 4.08, + "learning_rate": 3.2882502113271346e-05, + "loss": 107.3884, + "step": 4828, + "task_loss": 2.230194330215454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9712612539493475, + "compression/movement_sparsity/importance_threshold": -0.00020127855638633983, + "compression/movement_sparsity/linear_layer_sparsity": 0.904613729944504, + "compression/movement_sparsity/model_sparsity": 0.8735374654035697, + "compression_loss": 103.11822509765625, + "distillation_loss": 3.714754581451416, + "epoch": 4.08, + "learning_rate": 3.287780595472903e-05, + "loss": 107.136, + "step": 4829, + "task_loss": 2.2605957984924316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9713404240040936, + "compression/movement_sparsity/importance_threshold": -0.0002007240703172453, + "compression/movement_sparsity/linear_layer_sparsity": 0.9046947427394221, + "compression/movement_sparsity/model_sparsity": 0.8736156951597578, + "compression_loss": 103.12602233886719, + "distillation_loss": 3.527259111404419, + "epoch": 4.08, + "learning_rate": 3.287310979618672e-05, + "loss": 107.921, + "step": 4830, + "task_loss": 2.270867109298706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.971419448525692, + "compression/movement_sparsity/importance_threshold": -0.0002001706035237253, + "compression/movement_sparsity/linear_layer_sparsity": 0.9047635332625134, + "compression/movement_sparsity/model_sparsity": 0.8736821225167565, + "compression_loss": 103.13387298583984, + "distillation_loss": 4.302008628845215, + "epoch": 4.08, + "learning_rate": 3.2868413637644405e-05, + "loss": 107.1231, + "step": 4831, + "task_loss": 2.0838630199432373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9714983276480276, + "compression/movement_sparsity/importance_threshold": -0.0001996181550680846, + "compression/movement_sparsity/linear_layer_sparsity": 0.9048708149987332, + "compression/movement_sparsity/model_sparsity": 0.8737857187953, + "compression_loss": 103.14170837402344, + "distillation_loss": 4.88677978515625, + "epoch": 4.08, + "learning_rate": 3.28637174791021e-05, + "loss": 107.0759, + "step": 4832, + "task_loss": 2.2611024379730225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9715770615049857, + "compression/movement_sparsity/importance_threshold": -0.00019906672401262624, + "compression/movement_sparsity/linear_layer_sparsity": 0.9048633623939608, + "compression/movement_sparsity/model_sparsity": 0.8737785222104284, + "compression_loss": 103.14945983886719, + "distillation_loss": 4.021677017211914, + "epoch": 4.09, + "learning_rate": 3.2859021320559784e-05, + "loss": 107.8425, + "step": 4833, + "task_loss": 2.316464900970459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9716556502304512, + "compression/movement_sparsity/importance_threshold": -0.0001985163094196567, + "compression/movement_sparsity/linear_layer_sparsity": 0.9049988805591423, + "compression/movement_sparsity/model_sparsity": 0.8739093849097334, + "compression_loss": 103.15727996826172, + "distillation_loss": 5.544647216796875, + "epoch": 4.09, + "learning_rate": 3.285432516201747e-05, + "loss": 107.9736, + "step": 4834, + "task_loss": 2.5688581466674805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9717340939583096, + "compression/movement_sparsity/importance_threshold": -0.00019796691035147643, + "compression/movement_sparsity/linear_layer_sparsity": 0.9049906767318089, + "compression/movement_sparsity/model_sparsity": 0.8739014629091068, + "compression_loss": 103.1650161743164, + "distillation_loss": 6.0070343017578125, + "epoch": 4.09, + "learning_rate": 3.284962900347516e-05, + "loss": 107.7886, + "step": 4835, + "task_loss": 3.17558217048645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9718123928224458, + "compression/movement_sparsity/importance_threshold": -0.00019741852587039106, + "compression/movement_sparsity/linear_layer_sparsity": 0.9051158804919854, + "compression/movement_sparsity/model_sparsity": 0.8740223655349494, + "compression_loss": 103.17269134521484, + "distillation_loss": 2.9787697792053223, + "epoch": 4.09, + "learning_rate": 3.284493284493284e-05, + "loss": 106.935, + "step": 4836, + "task_loss": 2.970285415649414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9718905469567449, + "compression/movement_sparsity/importance_threshold": -0.00019687115503870534, + "compression/movement_sparsity/linear_layer_sparsity": 0.9051511044831818, + "compression/movement_sparsity/model_sparsity": 0.8740563794736864, + "compression_loss": 103.18045043945312, + "distillation_loss": 4.196341514587402, + "epoch": 4.09, + "learning_rate": 3.2840236686390536e-05, + "loss": 107.6158, + "step": 4837, + "task_loss": 2.448638916015625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9719685564950922, + "compression/movement_sparsity/importance_threshold": -0.0001963247969187223, + "compression/movement_sparsity/linear_layer_sparsity": 0.9051163097620203, + "compression/movement_sparsity/model_sparsity": 0.874022780058238, + "compression_loss": 103.18806457519531, + "distillation_loss": 4.77727746963501, + "epoch": 4.09, + "learning_rate": 3.283554052784822e-05, + "loss": 107.6622, + "step": 4838, + "task_loss": 3.03879451751709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9720464215713728, + "compression/movement_sparsity/importance_threshold": -0.00019577945057274587, + "compression/movement_sparsity/linear_layer_sparsity": 0.9050762922554344, + "compression/movement_sparsity/model_sparsity": 0.8739841372761115, + "compression_loss": 103.19576263427734, + "distillation_loss": 5.401034355163574, + "epoch": 4.09, + "learning_rate": 3.283084436930591e-05, + "loss": 107.5658, + "step": 4839, + "task_loss": 3.071499824523926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9721241423194718, + "compression/movement_sparsity/importance_threshold": -0.00019523511506307992, + "compression/movement_sparsity/linear_layer_sparsity": 0.9050918294458639, + "compression/movement_sparsity/model_sparsity": 0.8739991407162518, + "compression_loss": 103.2033462524414, + "distillation_loss": 4.5809197425842285, + "epoch": 4.09, + "learning_rate": 3.2826148210763595e-05, + "loss": 107.6824, + "step": 4840, + "task_loss": 2.493220567703247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9722017188732743, + "compression/movement_sparsity/importance_threshold": -0.00019469178945202923, + "compression/movement_sparsity/linear_layer_sparsity": 0.9051056972528244, + "compression/movement_sparsity/model_sparsity": 0.8740125321213809, + "compression_loss": 103.2109603881836, + "distillation_loss": 5.460968017578125, + "epoch": 4.09, + "learning_rate": 3.282145205222129e-05, + "loss": 107.9276, + "step": 4841, + "task_loss": 2.878288745880127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9722791513666655, + "compression/movement_sparsity/importance_threshold": -0.0001941494728018977, + "compression/movement_sparsity/linear_layer_sparsity": 0.9051503174881178, + "compression/movement_sparsity/model_sparsity": 0.874055619514324, + "compression_loss": 103.21862030029297, + "distillation_loss": 3.1633620262145996, + "epoch": 4.09, + "learning_rate": 3.2816755893678975e-05, + "loss": 107.3947, + "step": 4842, + "task_loss": 1.761926531791687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9723564399335306, + "compression/movement_sparsity/importance_threshold": -0.00019360816417498834, + "compression/movement_sparsity/linear_layer_sparsity": 0.9051570307944968, + "compression/movement_sparsity/model_sparsity": 0.8740621021979763, + "compression_loss": 103.22618865966797, + "distillation_loss": 4.061526775360107, + "epoch": 4.09, + "learning_rate": 3.281205973513666e-05, + "loss": 107.2181, + "step": 4843, + "task_loss": 2.8614678382873535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9724335847077547, + "compression/movement_sparsity/importance_threshold": -0.00019306786263360593, + "compression/movement_sparsity/linear_layer_sparsity": 0.9052637520948378, + "compression/movement_sparsity/model_sparsity": 0.8741651572933374, + "compression_loss": 103.23377990722656, + "distillation_loss": 3.236111879348755, + "epoch": 4.09, + "learning_rate": 3.280736357659435e-05, + "loss": 107.4518, + "step": 4844, + "task_loss": 1.4589552879333496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9725105858232228, + "compression/movement_sparsity/importance_threshold": -0.00019252856724005438, + "compression/movement_sparsity/linear_layer_sparsity": 0.9053354878873351, + "compression/movement_sparsity/model_sparsity": 0.8742344287406775, + "compression_loss": 103.2413330078125, + "distillation_loss": 2.675389289855957, + "epoch": 4.1, + "learning_rate": 3.2802667418052034e-05, + "loss": 106.5977, + "step": 4845, + "task_loss": 1.2069915533065796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9725874434138202, + "compression/movement_sparsity/importance_threshold": -0.00019199027705663757, + "compression/movement_sparsity/linear_layer_sparsity": 0.9053688159358774, + "compression/movement_sparsity/model_sparsity": 0.8742666118682232, + "compression_loss": 103.24886322021484, + "distillation_loss": 3.2741708755493164, + "epoch": 4.1, + "learning_rate": 3.279797125950973e-05, + "loss": 107.7795, + "step": 4846, + "task_loss": 3.3193116188049316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9726641576134321, + "compression/movement_sparsity/importance_threshold": -0.00019145299114565854, + "compression/movement_sparsity/linear_layer_sparsity": 0.905392700043652, + "compression/movement_sparsity/model_sparsity": 0.8742896754834196, + "compression_loss": 103.25633239746094, + "distillation_loss": 6.7563862800598145, + "epoch": 4.1, + "learning_rate": 3.2793275100967406e-05, + "loss": 108.3913, + "step": 4847, + "task_loss": 4.259468078613281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9727407285559434, + "compression/movement_sparsity/importance_threshold": -0.00019091670856942293, + "compression/movement_sparsity/linear_layer_sparsity": 0.9054963210604077, + "compression/movement_sparsity/model_sparsity": 0.8743897367994742, + "compression_loss": 103.26383972167969, + "distillation_loss": 5.104982376098633, + "epoch": 4.1, + "learning_rate": 3.27885789424251e-05, + "loss": 107.8564, + "step": 4848, + "task_loss": 2.2054660320281982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9728171563752394, + "compression/movement_sparsity/importance_threshold": -0.00019038142839023463, + "compression/movement_sparsity/linear_layer_sparsity": 0.9055548448751645, + "compression/movement_sparsity/model_sparsity": 0.8744462501411538, + "compression_loss": 103.2713623046875, + "distillation_loss": 5.091462135314941, + "epoch": 4.1, + "learning_rate": 3.2783882783882786e-05, + "loss": 107.9965, + "step": 4849, + "task_loss": 2.056246280670166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9728934412052053, + "compression/movement_sparsity/importance_threshold": -0.0001898471496703958, + "compression/movement_sparsity/linear_layer_sparsity": 0.9055949220025886, + "compression/movement_sparsity/model_sparsity": 0.8744849504959593, + "compression_loss": 103.27880096435547, + "distillation_loss": 3.1431407928466797, + "epoch": 4.1, + "learning_rate": 3.277918662534047e-05, + "loss": 107.3283, + "step": 4850, + "task_loss": 2.1383557319641113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.972969583179726, + "compression/movement_sparsity/importance_threshold": -0.00018931387147221208, + "compression/movement_sparsity/linear_layer_sparsity": 0.9056764356125474, + "compression/movement_sparsity/model_sparsity": 0.8745636638626507, + "compression_loss": 103.28632354736328, + "distillation_loss": 3.8292155265808105, + "epoch": 4.1, + "learning_rate": 3.2774490466798165e-05, + "loss": 107.2169, + "step": 4851, + "task_loss": 1.37556791305542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9730455824326869, + "compression/movement_sparsity/importance_threshold": -0.00018878159285798737, + "compression/movement_sparsity/linear_layer_sparsity": 0.9057316206603662, + "compression/movement_sparsity/model_sparsity": 0.8746169531343079, + "compression_loss": 103.2937240600586, + "distillation_loss": 3.828979969024658, + "epoch": 4.1, + "learning_rate": 3.2769794308255845e-05, + "loss": 107.5758, + "step": 4852, + "task_loss": 2.2090728282928467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.973121439097973, + "compression/movement_sparsity/importance_threshold": -0.0001882503128900247, + "compression/movement_sparsity/linear_layer_sparsity": 0.9056524918839345, + "compression/movement_sparsity/model_sparsity": 0.8745405426747753, + "compression_loss": 103.30120849609375, + "distillation_loss": 4.179638862609863, + "epoch": 4.1, + "learning_rate": 3.276509814971354e-05, + "loss": 108.3649, + "step": 4853, + "task_loss": 2.1499128341674805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9731971533094697, + "compression/movement_sparsity/importance_threshold": -0.00018772003063062797, + "compression/movement_sparsity/linear_layer_sparsity": 0.9057002004786456, + "compression/movement_sparsity/model_sparsity": 0.8745866123324892, + "compression_loss": 103.30866241455078, + "distillation_loss": 5.240311622619629, + "epoch": 4.1, + "learning_rate": 3.2760401991171224e-05, + "loss": 107.6471, + "step": 4854, + "task_loss": 2.432844877243042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9732727252010617, + "compression/movement_sparsity/importance_threshold": -0.0001871907451421028, + "compression/movement_sparsity/linear_layer_sparsity": 0.9057916111477422, + "compression/movement_sparsity/model_sparsity": 0.8746748827638903, + "compression_loss": 103.31600189208984, + "distillation_loss": 5.067829608917236, + "epoch": 4.1, + "learning_rate": 3.275570583262891e-05, + "loss": 107.9471, + "step": 4855, + "task_loss": 2.5357766151428223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9733481549066343, + "compression/movement_sparsity/importance_threshold": -0.00018666245548675225, + "compression/movement_sparsity/linear_layer_sparsity": 0.9057904664276492, + "compression/movement_sparsity/model_sparsity": 0.874673777368454, + "compression_loss": 103.32350158691406, + "distillation_loss": 6.4371843338012695, + "epoch": 4.1, + "learning_rate": 3.27510096740866e-05, + "loss": 107.5209, + "step": 4856, + "task_loss": 3.6523308753967285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9734234425600728, + "compression/movement_sparsity/importance_threshold": -0.00018613516072687932, + "compression/movement_sparsity/linear_layer_sparsity": 0.9058622856893199, + "compression/movement_sparsity/model_sparsity": 0.8747431294175445, + "compression_loss": 103.33088684082031, + "distillation_loss": 4.006731986999512, + "epoch": 4.11, + "learning_rate": 3.274631351554428e-05, + "loss": 107.9295, + "step": 4857, + "task_loss": 2.3039591312408447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9734985882952623, + "compression/movement_sparsity/importance_threshold": -0.00018560885992478966, + "compression/movement_sparsity/linear_layer_sparsity": 0.9059462914503146, + "compression/movement_sparsity/model_sparsity": 0.874824249322217, + "compression_loss": 103.3382339477539, + "distillation_loss": 5.863892555236816, + "epoch": 4.11, + "learning_rate": 3.2741617357001976e-05, + "loss": 108.671, + "step": 4858, + "task_loss": 3.5703442096710205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9735735922460879, + "compression/movement_sparsity/importance_threshold": -0.00018508355214278543, + "compression/movement_sparsity/linear_layer_sparsity": 0.9059895761788328, + "compression/movement_sparsity/model_sparsity": 0.8748660470871512, + "compression_loss": 103.34549713134766, + "distillation_loss": 4.333067893981934, + "epoch": 4.11, + "learning_rate": 3.273692119845966e-05, + "loss": 107.627, + "step": 4859, + "task_loss": 2.58748197555542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9736484545464347, + "compression/movement_sparsity/importance_threshold": -0.0001845592364431714, + "compression/movement_sparsity/linear_layer_sparsity": 0.9059760780210689, + "compression/movement_sparsity/model_sparsity": 0.8748530126326317, + "compression_loss": 103.35281372070312, + "distillation_loss": 5.874433517456055, + "epoch": 4.11, + "learning_rate": 3.273222503991735e-05, + "loss": 107.8407, + "step": 4860, + "task_loss": 2.864546298980713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9737231753301877, + "compression/movement_sparsity/importance_threshold": -0.0001840359118882532, + "compression/movement_sparsity/linear_layer_sparsity": 0.9060997435636204, + "compression/movement_sparsity/model_sparsity": 0.874972429883357, + "compression_loss": 103.36009216308594, + "distillation_loss": 5.162848472595215, + "epoch": 4.11, + "learning_rate": 3.2727528881375035e-05, + "loss": 107.5571, + "step": 4861, + "task_loss": 3.739835739135742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9737977547312324, + "compression/movement_sparsity/importance_threshold": -0.00018351357754033213, + "compression/movement_sparsity/linear_layer_sparsity": 0.906137090056656, + "compression/movement_sparsity/model_sparsity": 0.8750084934094654, + "compression_loss": 103.36736297607422, + "distillation_loss": 5.618568420410156, + "epoch": 4.11, + "learning_rate": 3.272283272283272e-05, + "loss": 107.5981, + "step": 4862, + "task_loss": 3.8957080841064453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9738721928834537, + "compression/movement_sparsity/importance_threshold": -0.00018299223246171382, + "compression/movement_sparsity/linear_layer_sparsity": 0.9061086270685093, + "compression/movement_sparsity/model_sparsity": 0.8749810082125239, + "compression_loss": 103.37456512451172, + "distillation_loss": 3.5987634658813477, + "epoch": 4.11, + "learning_rate": 3.2718136564290415e-05, + "loss": 107.6706, + "step": 4863, + "task_loss": 3.310443878173828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9739464899207367, + "compression/movement_sparsity/importance_threshold": -0.00018247187571470216, + "compression/movement_sparsity/linear_layer_sparsity": 0.9061518879486922, + "compression/movement_sparsity/model_sparsity": 0.8750227829483864, + "compression_loss": 103.38175201416016, + "distillation_loss": 3.966580867767334, + "epoch": 4.11, + "learning_rate": 3.2713440405748094e-05, + "loss": 107.1039, + "step": 4864, + "task_loss": 3.165783166885376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9740206459769667, + "compression/movement_sparsity/importance_threshold": -0.00018195250636160107, + "compression/movement_sparsity/linear_layer_sparsity": 0.9060907885137259, + "compression/movement_sparsity/model_sparsity": 0.8749637824669753, + "compression_loss": 103.38908386230469, + "distillation_loss": 5.530395030975342, + "epoch": 4.11, + "learning_rate": 3.270874424720579e-05, + "loss": 108.2447, + "step": 4865, + "task_loss": 3.1670470237731934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9740946611860288, + "compression/movement_sparsity/importance_threshold": -0.00018143412346471356, + "compression/movement_sparsity/linear_layer_sparsity": 0.9061336439722093, + "compression/movement_sparsity/model_sparsity": 0.8750051657086209, + "compression_loss": 103.39628601074219, + "distillation_loss": 4.051733016967773, + "epoch": 4.11, + "learning_rate": 3.2704048088663474e-05, + "loss": 107.5284, + "step": 4866, + "task_loss": 1.704953908920288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9741685356818082, + "compression/movement_sparsity/importance_threshold": -0.00018091672608634354, + "compression/movement_sparsity/linear_layer_sparsity": 0.9061666739165606, + "compression/movement_sparsity/model_sparsity": 0.8750370609727717, + "compression_loss": 103.40347290039062, + "distillation_loss": 4.476462364196777, + "epoch": 4.11, + "learning_rate": 3.269935193012117e-05, + "loss": 107.9956, + "step": 4867, + "task_loss": 2.446237087249756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9742422695981897, + "compression/movement_sparsity/importance_threshold": -0.0001804003132887975, + "compression/movement_sparsity/linear_layer_sparsity": 0.906196329321471, + "compression/movement_sparsity/model_sparsity": 0.8750656976232928, + "compression_loss": 103.41069793701172, + "distillation_loss": 5.960323810577393, + "epoch": 4.11, + "learning_rate": 3.269465577157885e-05, + "loss": 108.2353, + "step": 4868, + "task_loss": 3.4745163917541504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9743158630690588, + "compression/movement_sparsity/importance_threshold": -0.00017988488413437677, + "compression/movement_sparsity/linear_layer_sparsity": 0.906242368532713, + "compression/movement_sparsity/model_sparsity": 0.8751101552459954, + "compression_loss": 103.41789245605469, + "distillation_loss": 4.863421440124512, + "epoch": 4.12, + "learning_rate": 3.268995961303653e-05, + "loss": 108.0017, + "step": 4869, + "task_loss": 1.4490094184875488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9743893162283006, + "compression/movement_sparsity/importance_threshold": -0.00017937043768538607, + "compression/movement_sparsity/linear_layer_sparsity": 0.9062592173815825, + "compression/movement_sparsity/model_sparsity": 0.8751264252850731, + "compression_loss": 103.42511749267578, + "distillation_loss": 3.9651474952697754, + "epoch": 4.12, + "learning_rate": 3.2685263454494226e-05, + "loss": 106.965, + "step": 4870, + "task_loss": 2.2308053970336914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9744626292098001, + "compression/movement_sparsity/importance_threshold": -0.0001788569730041302, + "compression/movement_sparsity/linear_layer_sparsity": 0.906255401647939, + "compression/movement_sparsity/model_sparsity": 0.8751227406336188, + "compression_loss": 103.43228912353516, + "distillation_loss": 4.086702346801758, + "epoch": 4.12, + "learning_rate": 3.268056729595191e-05, + "loss": 107.0643, + "step": 4871, + "task_loss": 2.674440383911133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9745358021474425, + "compression/movement_sparsity/importance_threshold": -0.00017834448915291217, + "compression/movement_sparsity/linear_layer_sparsity": 0.9061897591051036, + "compression/movement_sparsity/model_sparsity": 0.8750593531140699, + "compression_loss": 103.43948364257812, + "distillation_loss": 6.035675048828125, + "epoch": 4.12, + "learning_rate": 3.2675871137409605e-05, + "loss": 108.6245, + "step": 4872, + "task_loss": 3.1565182209014893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.974608835175113, + "compression/movement_sparsity/importance_threshold": -0.00017783298519403676, + "compression/movement_sparsity/linear_layer_sparsity": 0.9062934993635356, + "compression/movement_sparsity/model_sparsity": 0.8751595295754825, + "compression_loss": 103.44657897949219, + "distillation_loss": 4.713630676269531, + "epoch": 4.12, + "learning_rate": 3.2671174978867285e-05, + "loss": 107.5689, + "step": 4873, + "task_loss": 2.0402321815490723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9746817284266966, + "compression/movement_sparsity/importance_threshold": -0.000177322460189807, + "compression/movement_sparsity/linear_layer_sparsity": 0.9063929230732835, + "compression/movement_sparsity/model_sparsity": 0.8752555377749374, + "compression_loss": 103.45375061035156, + "distillation_loss": 3.904275894165039, + "epoch": 4.12, + "learning_rate": 3.266647882032498e-05, + "loss": 107.3187, + "step": 4874, + "task_loss": 2.783564805984497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9747544820360787, + "compression/movement_sparsity/importance_threshold": -0.00017681291320252677, + "compression/movement_sparsity/linear_layer_sparsity": 0.9064312354238975, + "compression/movement_sparsity/model_sparsity": 0.8752925339784452, + "compression_loss": 103.4608383178711, + "distillation_loss": 4.348126411437988, + "epoch": 4.12, + "learning_rate": 3.2661782661782664e-05, + "loss": 107.9443, + "step": 4875, + "task_loss": 2.3940958976745605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9748270961371442, + "compression/movement_sparsity/importance_threshold": -0.00017630434329450086, + "compression/movement_sparsity/linear_layer_sparsity": 0.9064873863292947, + "compression/movement_sparsity/model_sparsity": 0.8753467559275017, + "compression_loss": 103.46794891357422, + "distillation_loss": 3.9418463706970215, + "epoch": 4.12, + "learning_rate": 3.265708650324035e-05, + "loss": 107.9741, + "step": 4876, + "task_loss": 1.3091652393341064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9748995708637783, + "compression/movement_sparsity/importance_threshold": -0.00017579674952803315, + "compression/movement_sparsity/linear_layer_sparsity": 0.9065482949775787, + "compression/movement_sparsity/model_sparsity": 0.8754055721763402, + "compression_loss": 103.47506713867188, + "distillation_loss": 4.854995250701904, + "epoch": 4.12, + "learning_rate": 3.265239034469804e-05, + "loss": 107.8088, + "step": 4877, + "task_loss": 3.5786190032958984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9749719063498662, + "compression/movement_sparsity/importance_threshold": -0.00017529013096542756, + "compression/movement_sparsity/linear_layer_sparsity": 0.9066035157979004, + "compression/movement_sparsity/model_sparsity": 0.8754588959916048, + "compression_loss": 103.48211669921875, + "distillation_loss": 6.065943717956543, + "epoch": 4.12, + "learning_rate": 3.2647694186155723e-05, + "loss": 108.0226, + "step": 4878, + "task_loss": 4.0540385246276855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.975044102729293, + "compression/movement_sparsity/importance_threshold": -0.00017478448666898797, + "compression/movement_sparsity/linear_layer_sparsity": 0.9066356275813439, + "compression/movement_sparsity/model_sparsity": 0.8754899046364994, + "compression_loss": 103.48915100097656, + "distillation_loss": 4.033247470855713, + "epoch": 4.12, + "learning_rate": 3.2642998027613417e-05, + "loss": 107.1277, + "step": 4879, + "task_loss": 1.6673756837844849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9751161601359437, + "compression/movement_sparsity/importance_threshold": -0.00017427981570101916, + "compression/movement_sparsity/linear_layer_sparsity": 0.9067174035229906, + "compression/movement_sparsity/model_sparsity": 0.8755688713229783, + "compression_loss": 103.49620819091797, + "distillation_loss": 5.9304914474487305, + "epoch": 4.13, + "learning_rate": 3.26383018690711e-05, + "loss": 107.9008, + "step": 4880, + "task_loss": 2.5920910835266113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9751880787037037, + "compression/movement_sparsity/importance_threshold": -0.0001737761171238224, + "compression/movement_sparsity/linear_layer_sparsity": 0.9067533191159097, + "compression/movement_sparsity/model_sparsity": 0.8756035531047915, + "compression_loss": 103.50323486328125, + "distillation_loss": 6.632582664489746, + "epoch": 4.13, + "learning_rate": 3.263360571052879e-05, + "loss": 109.2561, + "step": 4881, + "task_loss": 2.338265895843506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.975259858566458, + "compression/movement_sparsity/importance_threshold": -0.0001732733899997051, + "compression/movement_sparsity/linear_layer_sparsity": 0.9067915956940209, + "compression/movement_sparsity/model_sparsity": 0.875640514764692, + "compression_loss": 103.51026916503906, + "distillation_loss": 3.9054248332977295, + "epoch": 4.13, + "learning_rate": 3.2628909551986476e-05, + "loss": 108.0137, + "step": 4882, + "task_loss": 2.698185682296753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9753314998580918, + "compression/movement_sparsity/importance_threshold": -0.0001727716333909694, + "compression/movement_sparsity/linear_layer_sparsity": 0.9067980824412148, + "compression/movement_sparsity/model_sparsity": 0.8756467786721642, + "compression_loss": 103.51720428466797, + "distillation_loss": 4.495738506317139, + "epoch": 4.13, + "learning_rate": 3.262421339344416e-05, + "loss": 107.8718, + "step": 4883, + "task_loss": 3.1437809467315674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9754030027124901, + "compression/movement_sparsity/importance_threshold": -0.0001722708463599192, + "compression/movement_sparsity/linear_layer_sparsity": 0.9068137269491531, + "compression/movement_sparsity/model_sparsity": 0.8756618857431266, + "compression_loss": 103.5242691040039, + "distillation_loss": 5.174014091491699, + "epoch": 4.13, + "learning_rate": 3.2619517234901855e-05, + "loss": 107.2893, + "step": 4884, + "task_loss": 3.178811550140381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9754743672635382, + "compression/movement_sparsity/importance_threshold": -0.00017177102796885925, + "compression/movement_sparsity/linear_layer_sparsity": 0.9068892903994615, + "compression/movement_sparsity/model_sparsity": 0.8757348533564566, + "compression_loss": 103.53117370605469, + "distillation_loss": 3.480297088623047, + "epoch": 4.13, + "learning_rate": 3.261482107635954e-05, + "loss": 107.0925, + "step": 4885, + "task_loss": 1.2061827182769775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9755455936451212, + "compression/movement_sparsity/importance_threshold": -0.00017127217728009347, + "compression/movement_sparsity/linear_layer_sparsity": 0.9069658912523543, + "compression/movement_sparsity/model_sparsity": 0.8758088227344007, + "compression_loss": 103.5381088256836, + "distillation_loss": 4.13878870010376, + "epoch": 4.13, + "learning_rate": 3.261012491781723e-05, + "loss": 108.2381, + "step": 4886, + "task_loss": 2.4863176345825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9756166819911242, + "compression/movement_sparsity/importance_threshold": -0.0001707742933559249, + "compression/movement_sparsity/linear_layer_sparsity": 0.9070317365060395, + "compression/movement_sparsity/model_sparsity": 0.8758724060010582, + "compression_loss": 103.54505157470703, + "distillation_loss": 4.562363147735596, + "epoch": 4.13, + "learning_rate": 3.2605428759274914e-05, + "loss": 107.5238, + "step": 4887, + "task_loss": 2.006821870803833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9756876324354323, + "compression/movement_sparsity/importance_threshold": -0.00017027737525865828, + "compression/movement_sparsity/linear_layer_sparsity": 0.9071191048823075, + "compression/movement_sparsity/model_sparsity": 0.8759567730048249, + "compression_loss": 103.55194854736328, + "distillation_loss": 4.552233695983887, + "epoch": 4.13, + "learning_rate": 3.26007326007326e-05, + "loss": 107.9583, + "step": 4888, + "task_loss": 2.208235025405884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9757584451119308, + "compression/movement_sparsity/importance_threshold": -0.0001697814220505984, + "compression/movement_sparsity/linear_layer_sparsity": 0.9071163027029131, + "compression/movement_sparsity/model_sparsity": 0.8759540670889131, + "compression_loss": 103.55878448486328, + "distillation_loss": 3.906327486038208, + "epoch": 4.13, + "learning_rate": 3.2596036442190294e-05, + "loss": 107.3075, + "step": 4889, + "task_loss": 1.4058661460876465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9758291201545047, + "compression/movement_sparsity/importance_threshold": -0.00016928643279404652, + "compression/movement_sparsity/linear_layer_sparsity": 0.9070737691969559, + "compression/movement_sparsity/model_sparsity": 0.875912994739734, + "compression_loss": 103.56568908691406, + "distillation_loss": 5.188601493835449, + "epoch": 4.13, + "learning_rate": 3.259134028364797e-05, + "loss": 108.3778, + "step": 4890, + "task_loss": 3.2189619541168213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9758996576970391, + "compression/movement_sparsity/importance_threshold": -0.00016879240655131005, + "compression/movement_sparsity/linear_layer_sparsity": 0.9071194387590014, + "compression/movement_sparsity/model_sparsity": 0.8759570954118271, + "compression_loss": 103.57255554199219, + "distillation_loss": 2.815682888031006, + "epoch": 4.13, + "learning_rate": 3.2586644125105666e-05, + "loss": 107.6905, + "step": 4891, + "task_loss": 1.708917498588562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9759700578734194, + "compression/movement_sparsity/importance_threshold": -0.00016829934238469027, + "compression/movement_sparsity/linear_layer_sparsity": 0.9072219031314963, + "compression/movement_sparsity/model_sparsity": 0.8760560398179096, + "compression_loss": 103.57942962646484, + "distillation_loss": 3.949026584625244, + "epoch": 4.14, + "learning_rate": 3.258194796656335e-05, + "loss": 108.15, + "step": 4892, + "task_loss": 2.611680507659912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9760403208175304, + "compression/movement_sparsity/importance_threshold": -0.0001678072393564928, + "compression/movement_sparsity/linear_layer_sparsity": 0.9071974108911722, + "compression/movement_sparsity/model_sparsity": 0.8760323889613876, + "compression_loss": 103.58628845214844, + "distillation_loss": 3.855121612548828, + "epoch": 4.14, + "learning_rate": 3.257725180802104e-05, + "loss": 108.0727, + "step": 4893, + "task_loss": 2.789794445037842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9761104466632575, + "compression/movement_sparsity/importance_threshold": -0.0001673160965290207, + "compression/movement_sparsity/linear_layer_sparsity": 0.9072110879114506, + "compression/movement_sparsity/model_sparsity": 0.876045596133944, + "compression_loss": 103.5931625366211, + "distillation_loss": 4.876151084899902, + "epoch": 4.14, + "learning_rate": 3.2572555649478725e-05, + "loss": 107.6805, + "step": 4894, + "task_loss": 3.3561418056488037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9761804355444857, + "compression/movement_sparsity/importance_threshold": -0.00016682591296457872, + "compression/movement_sparsity/linear_layer_sparsity": 0.9072490544612032, + "compression/movement_sparsity/model_sparsity": 0.8760822584159138, + "compression_loss": 103.60001373291016, + "distillation_loss": 3.6320643424987793, + "epoch": 4.14, + "learning_rate": 3.256785949093641e-05, + "loss": 107.8973, + "step": 4895, + "task_loss": 2.39764666557312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9762502875951002, + "compression/movement_sparsity/importance_threshold": -0.0001663366877254699, + "compression/movement_sparsity/linear_layer_sparsity": 0.907303810238987, + "compression/movement_sparsity/model_sparsity": 0.8761351331642823, + "compression_loss": 103.60682678222656, + "distillation_loss": 3.4754083156585693, + "epoch": 4.14, + "learning_rate": 3.2563163332394105e-05, + "loss": 107.8774, + "step": 4896, + "task_loss": 2.050096273422241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9763200029489862, + "compression/movement_sparsity/importance_threshold": -0.0001658484198739981, + "compression/movement_sparsity/linear_layer_sparsity": 0.9073405366753055, + "compression/movement_sparsity/model_sparsity": 0.8761705979345294, + "compression_loss": 103.61361694335938, + "distillation_loss": 3.674201250076294, + "epoch": 4.14, + "learning_rate": 3.255846717385179e-05, + "loss": 107.8859, + "step": 4897, + "task_loss": 2.7391393184661865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9763895817400288, + "compression/movement_sparsity/importance_threshold": -0.000165361108472469, + "compression/movement_sparsity/linear_layer_sparsity": 0.9073619286320442, + "compression/movement_sparsity/model_sparsity": 0.8761912550117449, + "compression_loss": 103.6203842163086, + "distillation_loss": 6.105525016784668, + "epoch": 4.14, + "learning_rate": 3.2553771015309484e-05, + "loss": 108.0989, + "step": 4898, + "task_loss": 3.5748777389526367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9764590241021129, + "compression/movement_sparsity/importance_threshold": -0.00016487475258318476, + "compression/movement_sparsity/linear_layer_sparsity": 0.9073730538804485, + "compression/movement_sparsity/model_sparsity": 0.8762019980736412, + "compression_loss": 103.6270751953125, + "distillation_loss": 4.935971736907959, + "epoch": 4.14, + "learning_rate": 3.2549074856767164e-05, + "loss": 108.6442, + "step": 4899, + "task_loss": 2.0858206748962402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9765283301691241, + "compression/movement_sparsity/importance_threshold": -0.00016438935126845012, + "compression/movement_sparsity/linear_layer_sparsity": 0.9074356676847044, + "compression/movement_sparsity/model_sparsity": 0.8762624609010983, + "compression_loss": 103.63385009765625, + "distillation_loss": 4.4408488273620605, + "epoch": 4.14, + "learning_rate": 3.254437869822485e-05, + "loss": 107.5947, + "step": 4900, + "task_loss": 2.2618634700775146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9765975000749472, + "compression/movement_sparsity/importance_threshold": -0.000163904903590569, + "compression/movement_sparsity/linear_layer_sparsity": 0.9075383705405521, + "compression/movement_sparsity/model_sparsity": 0.8763616355978967, + "compression_loss": 103.64054870605469, + "distillation_loss": 3.311948299407959, + "epoch": 4.14, + "learning_rate": 3.253968253968254e-05, + "loss": 107.9093, + "step": 4901, + "task_loss": 2.1013102531433105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9766665339534673, + "compression/movement_sparsity/importance_threshold": -0.00016342140861184701, + "compression/movement_sparsity/linear_layer_sparsity": 0.9075510697790843, + "compression/movement_sparsity/model_sparsity": 0.8763738985785179, + "compression_loss": 103.64723205566406, + "distillation_loss": 3.713339328765869, + "epoch": 4.14, + "learning_rate": 3.253498638114023e-05, + "loss": 107.8918, + "step": 4902, + "task_loss": 2.350912570953369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9767354319385698, + "compression/movement_sparsity/importance_threshold": -0.0001629388653945846, + "compression/movement_sparsity/linear_layer_sparsity": 0.9075860672110956, + "compression/movement_sparsity/model_sparsity": 0.8764076937410749, + "compression_loss": 103.65388488769531, + "distillation_loss": 5.604605674743652, + "epoch": 4.14, + "learning_rate": 3.2530290222597916e-05, + "loss": 108.2154, + "step": 4903, + "task_loss": 2.82411789894104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9768041941641397, + "compression/movement_sparsity/importance_threshold": -0.00016245727300108828, + "compression/movement_sparsity/linear_layer_sparsity": 0.9076391178329075, + "compression/movement_sparsity/model_sparsity": 0.8764589219108248, + "compression_loss": 103.66053771972656, + "distillation_loss": 4.7300333976745605, + "epoch": 4.15, + "learning_rate": 3.25255940640556e-05, + "loss": 108.0324, + "step": 4904, + "task_loss": 2.0891354084014893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9768728207640621, + "compression/movement_sparsity/importance_threshold": -0.00016197663049366106, + "compression/movement_sparsity/linear_layer_sparsity": 0.9077004080545559, + "compression/movement_sparsity/model_sparsity": 0.8765181066248087, + "compression_loss": 103.667236328125, + "distillation_loss": 2.6300272941589355, + "epoch": 4.15, + "learning_rate": 3.2520897905513295e-05, + "loss": 107.447, + "step": 4905, + "task_loss": 1.883621335029602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9769413118722222, + "compression/movement_sparsity/importance_threshold": -0.0001614969369346077, + "compression/movement_sparsity/linear_layer_sparsity": 0.9077660863698942, + "compression/movement_sparsity/model_sparsity": 0.876581528687965, + "compression_loss": 103.6739273071289, + "distillation_loss": 6.206882476806641, + "epoch": 4.15, + "learning_rate": 3.251620174697098e-05, + "loss": 108.4054, + "step": 4906, + "task_loss": 3.441716194152832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9770096676225052, + "compression/movement_sparsity/importance_threshold": -0.00016101819138623213, + "compression/movement_sparsity/linear_layer_sparsity": 0.9077905905343858, + "compression/movement_sparsity/model_sparsity": 0.8766051910590228, + "compression_loss": 103.68055725097656, + "distillation_loss": 4.013655662536621, + "epoch": 4.15, + "learning_rate": 3.251150558842866e-05, + "loss": 107.3144, + "step": 4907, + "task_loss": 1.9541929960250854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9770778881487961, + "compression/movement_sparsity/importance_threshold": -0.00016054039291083736, + "compression/movement_sparsity/linear_layer_sparsity": 0.9078108377710316, + "compression/movement_sparsity/model_sparsity": 0.8766247427408019, + "compression_loss": 103.68716430664062, + "distillation_loss": 5.0958967208862305, + "epoch": 4.15, + "learning_rate": 3.2506809429886354e-05, + "loss": 107.7633, + "step": 4908, + "task_loss": 2.7743234634399414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9771459735849801, + "compression/movement_sparsity/importance_threshold": -0.00016006354057072816, + "compression/movement_sparsity/linear_layer_sparsity": 0.9078627079002476, + "compression/movement_sparsity/model_sparsity": 0.8766748309715082, + "compression_loss": 103.6937255859375, + "distillation_loss": 4.230184555053711, + "epoch": 4.15, + "learning_rate": 3.250211327134404e-05, + "loss": 107.8555, + "step": 4909, + "task_loss": 2.480123519897461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9772139240649425, + "compression/movement_sparsity/importance_threshold": -0.00015958763342820842, + "compression/movement_sparsity/linear_layer_sparsity": 0.9078116605385984, + "compression/movement_sparsity/model_sparsity": 0.8766255372437718, + "compression_loss": 103.70024871826172, + "distillation_loss": 5.658873558044434, + "epoch": 4.15, + "learning_rate": 3.2497417112801734e-05, + "loss": 108.2364, + "step": 4910, + "task_loss": 3.0606720447540283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9772817397225682, + "compression/movement_sparsity/importance_threshold": -0.00015911267054558118, + "compression/movement_sparsity/linear_layer_sparsity": 0.9078693377374532, + "compression/movement_sparsity/model_sparsity": 0.87668123305341, + "compression_loss": 103.70674896240234, + "distillation_loss": 4.8780975341796875, + "epoch": 4.15, + "learning_rate": 3.249272095425941e-05, + "loss": 108.5508, + "step": 4911, + "task_loss": 3.2342422008514404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9773494206917425, + "compression/movement_sparsity/importance_threshold": -0.00015863865098515208, + "compression/movement_sparsity/linear_layer_sparsity": 0.9079308783666218, + "compression/movement_sparsity/model_sparsity": 0.8767406595726456, + "compression_loss": 103.71328735351562, + "distillation_loss": 4.006883144378662, + "epoch": 4.15, + "learning_rate": 3.2488024795717106e-05, + "loss": 108.1279, + "step": 4912, + "task_loss": 1.8937087059020996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9774169671063503, + "compression/movement_sparsity/importance_threshold": -0.00015816557380922414, + "compression/movement_sparsity/linear_layer_sparsity": 0.907959198264757, + "compression/movement_sparsity/model_sparsity": 0.8767680065951576, + "compression_loss": 103.71977233886719, + "distillation_loss": 5.814964294433594, + "epoch": 4.15, + "learning_rate": 3.248332863717479e-05, + "loss": 108.0557, + "step": 4913, + "task_loss": 3.5787041187286377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9774843791002771, + "compression/movement_sparsity/importance_threshold": -0.00015769343808010126, + "compression/movement_sparsity/linear_layer_sparsity": 0.9080090770579777, + "compression/movement_sparsity/model_sparsity": 0.8768161718983862, + "compression_loss": 103.72622680664062, + "distillation_loss": 4.371733665466309, + "epoch": 4.15, + "learning_rate": 3.247863247863248e-05, + "loss": 107.9525, + "step": 4914, + "task_loss": 1.6969681978225708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9775516568074077, + "compression/movement_sparsity/importance_threshold": -0.0001572222428600882, + "compression/movement_sparsity/linear_layer_sparsity": 0.9080994860969929, + "compression/movement_sparsity/model_sparsity": 0.8769034751087804, + "compression_loss": 103.73272705078125, + "distillation_loss": 3.0055060386657715, + "epoch": 4.15, + "learning_rate": 3.247393632009017e-05, + "loss": 107.7652, + "step": 4915, + "task_loss": 1.1817177534103394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9776188003616275, + "compression/movement_sparsity/importance_threshold": -0.00015675198721148802, + "compression/movement_sparsity/linear_layer_sparsity": 0.9081801530910495, + "compression/movement_sparsity/model_sparsity": 0.8769813709434304, + "compression_loss": 103.73918151855469, + "distillation_loss": 5.603509902954102, + "epoch": 4.16, + "learning_rate": 3.246924016154785e-05, + "loss": 108.0408, + "step": 4916, + "task_loss": 3.8461949825286865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9776858098968216, + "compression/movement_sparsity/importance_threshold": -0.00015628267019660459, + "compression/movement_sparsity/linear_layer_sparsity": 0.9082577555740237, + "compression/movement_sparsity/model_sparsity": 0.8770563075423814, + "compression_loss": 103.74560546875, + "distillation_loss": 4.520080089569092, + "epoch": 4.16, + "learning_rate": 3.2464544003005545e-05, + "loss": 107.5718, + "step": 4917, + "task_loss": 2.4631948471069336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.977752685546875, + "compression/movement_sparsity/importance_threshold": -0.00015581429087774268, + "compression/movement_sparsity/linear_layer_sparsity": 0.9083243043535995, + "compression/movement_sparsity/model_sparsity": 0.8771205701666507, + "compression_loss": 103.75205993652344, + "distillation_loss": 4.112934112548828, + "epoch": 4.16, + "learning_rate": 3.245984784446323e-05, + "loss": 107.8388, + "step": 4918, + "task_loss": 3.36824369430542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.977819427445673, + "compression/movement_sparsity/importance_threshold": -0.0001553468483172062, + "compression/movement_sparsity/linear_layer_sparsity": 0.9084005474814631, + "compression/movement_sparsity/model_sparsity": 0.877194194108521, + "compression_loss": 103.75852966308594, + "distillation_loss": 3.742875576019287, + "epoch": 4.16, + "learning_rate": 3.245515168592092e-05, + "loss": 108.1045, + "step": 4919, + "task_loss": 1.465295672416687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9778860357271005, + "compression/movement_sparsity/importance_threshold": -0.00015488034157729817, + "compression/movement_sparsity/linear_layer_sparsity": 0.9084050667409972, + "compression/movement_sparsity/model_sparsity": 0.8771985581175871, + "compression_loss": 103.76490783691406, + "distillation_loss": 4.877608299255371, + "epoch": 4.16, + "learning_rate": 3.2450455527378604e-05, + "loss": 107.9625, + "step": 4920, + "task_loss": 2.666753053665161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.977952510525043, + "compression/movement_sparsity/importance_threshold": -0.00015441476972032336, + "compression/movement_sparsity/linear_layer_sparsity": 0.9083827923958534, + "compression/movement_sparsity/model_sparsity": 0.8771770489647229, + "compression_loss": 103.7712631225586, + "distillation_loss": 3.0660858154296875, + "epoch": 4.16, + "learning_rate": 3.244575936883629e-05, + "loss": 107.9188, + "step": 4921, + "task_loss": 2.8454513549804688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9780188519733853, + "compression/movement_sparsity/importance_threshold": -0.00015395013180858567, + "compression/movement_sparsity/linear_layer_sparsity": 0.9084047924851415, + "compression/movement_sparsity/model_sparsity": 0.8771982932832638, + "compression_loss": 103.77765655517578, + "distillation_loss": 5.5451860427856445, + "epoch": 4.16, + "learning_rate": 3.2441063210293983e-05, + "loss": 109.4399, + "step": 4922, + "task_loss": 3.351867914199829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9780850602060127, + "compression/movement_sparsity/importance_threshold": -0.00015348642690438986, + "compression/movement_sparsity/linear_layer_sparsity": 0.9084170386053035, + "compression/movement_sparsity/model_sparsity": 0.8772101187115249, + "compression_loss": 103.78401947021484, + "distillation_loss": 4.588109016418457, + "epoch": 4.16, + "learning_rate": 3.243636705175167e-05, + "loss": 107.4691, + "step": 4923, + "task_loss": 2.8633599281311035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9781511353568103, + "compression/movement_sparsity/importance_threshold": -0.00015302365407003898, + "compression/movement_sparsity/linear_layer_sparsity": 0.9084920416197332, + "compression/movement_sparsity/model_sparsity": 0.8772825451416725, + "compression_loss": 103.7903823852539, + "distillation_loss": 4.177319049835205, + "epoch": 4.16, + "learning_rate": 3.2431670893209356e-05, + "loss": 108.6148, + "step": 4924, + "task_loss": 1.9912488460540771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9782170775596634, + "compression/movement_sparsity/importance_threshold": -0.0001525618123678369, + "compression/movement_sparsity/linear_layer_sparsity": 0.9085561697932788, + "compression/movement_sparsity/model_sparsity": 0.8773444703151755, + "compression_loss": 103.7967758178711, + "distillation_loss": 3.715540885925293, + "epoch": 4.16, + "learning_rate": 3.242697473466704e-05, + "loss": 107.8677, + "step": 4925, + "task_loss": 1.77749764919281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9782828869484569, + "compression/movement_sparsity/importance_threshold": -0.00015210090086008755, + "compression/movement_sparsity/linear_layer_sparsity": 0.9086346904371609, + "compression/movement_sparsity/model_sparsity": 0.8774202935333826, + "compression_loss": 103.8031234741211, + "distillation_loss": 5.273666858673096, + "epoch": 4.16, + "learning_rate": 3.242227857612473e-05, + "loss": 108.6352, + "step": 4926, + "task_loss": 2.6297898292541504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9783485636570761, + "compression/movement_sparsity/importance_threshold": -0.0001516409186090948, + "compression/movement_sparsity/linear_layer_sparsity": 0.9087253141111935, + "compression/movement_sparsity/model_sparsity": 0.877507804005421, + "compression_loss": 103.80950164794922, + "distillation_loss": 5.81418514251709, + "epoch": 4.16, + "learning_rate": 3.241758241758242e-05, + "loss": 108.263, + "step": 4927, + "task_loss": 2.356076240539551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9784141078194061, + "compression/movement_sparsity/importance_threshold": -0.00015118186467716344, + "compression/movement_sparsity/linear_layer_sparsity": 0.9087867712711888, + "compression/movement_sparsity/model_sparsity": 0.8775671499229062, + "compression_loss": 103.81583404541016, + "distillation_loss": 4.095522880554199, + "epoch": 4.17, + "learning_rate": 3.241288625904011e-05, + "loss": 108.1185, + "step": 4928, + "task_loss": 1.751255750656128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9784795195693319, + "compression/movement_sparsity/importance_threshold": -0.00015072373812659821, + "compression/movement_sparsity/linear_layer_sparsity": 0.9087989816188479, + "compression/movement_sparsity/model_sparsity": 0.8775789408075597, + "compression_loss": 103.82217407226562, + "distillation_loss": 4.528531074523926, + "epoch": 4.17, + "learning_rate": 3.2408190100497795e-05, + "loss": 108.7427, + "step": 4929, + "task_loss": 2.6554863452911377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9785447990407389, + "compression/movement_sparsity/importance_threshold": -0.00015026653801970043, + "compression/movement_sparsity/linear_layer_sparsity": 0.9087965490886502, + "compression/movement_sparsity/model_sparsity": 0.8775765918422577, + "compression_loss": 103.82848358154297, + "distillation_loss": 4.656617164611816, + "epoch": 4.17, + "learning_rate": 3.240349394195548e-05, + "loss": 108.6566, + "step": 4930, + "task_loss": 2.644467353820801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9786099463675121, + "compression/movement_sparsity/importance_threshold": -0.00014981026341877573, + "compression/movement_sparsity/linear_layer_sparsity": 0.9088196938980314, + "compression/movement_sparsity/model_sparsity": 0.8775989415562349, + "compression_loss": 103.83478546142578, + "distillation_loss": 4.464627742767334, + "epoch": 4.17, + "learning_rate": 3.239879778341317e-05, + "loss": 108.2843, + "step": 4931, + "task_loss": 2.8621952533721924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9786749616835365, + "compression/movement_sparsity/importance_threshold": -0.00014935491338612886, + "compression/movement_sparsity/linear_layer_sparsity": 0.9088313438098117, + "compression/movement_sparsity/model_sparsity": 0.8776101912577061, + "compression_loss": 103.84107208251953, + "distillation_loss": 4.504247188568115, + "epoch": 4.17, + "learning_rate": 3.239410162487086e-05, + "loss": 108.6984, + "step": 4932, + "task_loss": 3.1847739219665527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9787398451226975, + "compression/movement_sparsity/importance_threshold": -0.000148900486984062, + "compression/movement_sparsity/linear_layer_sparsity": 0.9088649461142095, + "compression/movement_sparsity/model_sparsity": 0.8776426392195752, + "compression_loss": 103.84729766845703, + "distillation_loss": 4.58244514465332, + "epoch": 4.17, + "learning_rate": 3.238940546632854e-05, + "loss": 107.6181, + "step": 4933, + "task_loss": 2.603705883026123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9788045968188802, + "compression/movement_sparsity/importance_threshold": -0.00014844698327487905, + "compression/movement_sparsity/linear_layer_sparsity": 0.9088454858726278, + "compression/movement_sparsity/model_sparsity": 0.8776238474971585, + "compression_loss": 103.8534927368164, + "distillation_loss": 3.9984004497528076, + "epoch": 4.17, + "learning_rate": 3.238470930778623e-05, + "loss": 107.2262, + "step": 4934, + "task_loss": 1.6735730171203613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9788692169059696, + "compression/movement_sparsity/importance_threshold": -0.0001479944013208865, + "compression/movement_sparsity/linear_layer_sparsity": 0.9089138471256841, + "compression/movement_sparsity/model_sparsity": 0.8776898603308686, + "compression_loss": 103.85972595214844, + "distillation_loss": 5.435490131378174, + "epoch": 4.17, + "learning_rate": 3.238001314924392e-05, + "loss": 108.7948, + "step": 4935, + "task_loss": 2.7765865325927734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.978933705517851, + "compression/movement_sparsity/importance_threshold": -0.00014754274018438564, + "compression/movement_sparsity/linear_layer_sparsity": 0.9089759243423965, + "compression/movement_sparsity/model_sparsity": 0.877749805004215, + "compression_loss": 103.86590576171875, + "distillation_loss": 3.8262877464294434, + "epoch": 4.17, + "learning_rate": 3.237531699070161e-05, + "loss": 108.09, + "step": 4936, + "task_loss": 1.668071985244751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9789980627884093, + "compression/movement_sparsity/importance_threshold": -0.000147091998927683, + "compression/movement_sparsity/linear_layer_sparsity": 0.9090050670080985, + "compression/movement_sparsity/model_sparsity": 0.8777779465296968, + "compression_loss": 103.87210083007812, + "distillation_loss": 3.899608612060547, + "epoch": 4.17, + "learning_rate": 3.237062083215929e-05, + "loss": 107.5575, + "step": 4937, + "task_loss": 2.410928964614868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9790622888515299, + "compression/movement_sparsity/importance_threshold": -0.00014664217661307984, + "compression/movement_sparsity/linear_layer_sparsity": 0.9089711665995097, + "compression/movement_sparsity/model_sparsity": 0.8777452107044329, + "compression_loss": 103.87832641601562, + "distillation_loss": 6.1531877517700195, + "epoch": 4.17, + "learning_rate": 3.2365924673616985e-05, + "loss": 109.1951, + "step": 4938, + "task_loss": 3.091554641723633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9791263838410977, + "compression/movement_sparsity/importance_threshold": -0.00014619327230288182, + "compression/movement_sparsity/linear_layer_sparsity": 0.9089893748034897, + "compression/movement_sparsity/model_sparsity": 0.8777627934005912, + "compression_loss": 103.88449096679688, + "distillation_loss": 3.6427648067474365, + "epoch": 4.17, + "learning_rate": 3.236122851507467e-05, + "loss": 108.0407, + "step": 4939, + "task_loss": 1.7808279991149902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9791903478909981, + "compression/movement_sparsity/importance_threshold": -0.00014574528505939283, + "compression/movement_sparsity/linear_layer_sparsity": 0.9088964736134387, + "compression/movement_sparsity/model_sparsity": 0.877673083652216, + "compression_loss": 103.89066314697266, + "distillation_loss": 4.707060813903809, + "epoch": 4.18, + "learning_rate": 3.235653235653236e-05, + "loss": 108.5641, + "step": 4940, + "task_loss": 1.8706927299499512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9792541811351161, + "compression/movement_sparsity/importance_threshold": -0.0001452982139449159, + "compression/movement_sparsity/linear_layer_sparsity": 0.9089289550460788, + "compression/movement_sparsity/model_sparsity": 0.8777044492477203, + "compression_loss": 103.89676666259766, + "distillation_loss": 6.042444705963135, + "epoch": 4.18, + "learning_rate": 3.2351836197990044e-05, + "loss": 108.3449, + "step": 4941, + "task_loss": 2.5021352767944336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9793178837073369, + "compression/movement_sparsity/importance_threshold": -0.0001448520580217558, + "compression/movement_sparsity/linear_layer_sparsity": 0.9088778242152562, + "compression/movement_sparsity/model_sparsity": 0.8776550749182332, + "compression_loss": 103.90290069580078, + "distillation_loss": 4.144269943237305, + "epoch": 4.18, + "learning_rate": 3.234714003944773e-05, + "loss": 107.3744, + "step": 4942, + "task_loss": 2.4927775859832764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9793814557415456, + "compression/movement_sparsity/importance_threshold": -0.00014440681635221643, + "compression/movement_sparsity/linear_layer_sparsity": 0.9089402830053329, + "compression/movement_sparsity/model_sparsity": 0.8777153880567251, + "compression_loss": 103.90899658203125, + "distillation_loss": 4.880977630615234, + "epoch": 4.18, + "learning_rate": 3.2342443880905424e-05, + "loss": 108.66, + "step": 4943, + "task_loss": 2.876920223236084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9794448973716272, + "compression/movement_sparsity/importance_threshold": -0.00014396248799860168, + "compression/movement_sparsity/linear_layer_sparsity": 0.9089814929286825, + "compression/movement_sparsity/model_sparsity": 0.877755182292431, + "compression_loss": 103.91514587402344, + "distillation_loss": 3.806028366088867, + "epoch": 4.18, + "learning_rate": 3.233774772236311e-05, + "loss": 108.3256, + "step": 4944, + "task_loss": 1.8639618158340454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.979508208731467, + "compression/movement_sparsity/importance_threshold": -0.00014351907202321632, + "compression/movement_sparsity/linear_layer_sparsity": 0.9089820056678908, + "compression/movement_sparsity/model_sparsity": 0.8777556774174702, + "compression_loss": 103.92124938964844, + "distillation_loss": 6.151072025299072, + "epoch": 4.18, + "learning_rate": 3.2333051563820796e-05, + "loss": 108.0019, + "step": 4945, + "task_loss": 3.5995988845825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9795713899549502, + "compression/movement_sparsity/importance_threshold": -0.00014307656748836252, + "compression/movement_sparsity/linear_layer_sparsity": 0.9090139743613225, + "compression/movement_sparsity/model_sparsity": 0.8777865478879353, + "compression_loss": 103.9273452758789, + "distillation_loss": 5.25455379486084, + "epoch": 4.18, + "learning_rate": 3.232835540527848e-05, + "loss": 108.6378, + "step": 4946, + "task_loss": 2.0346150398254395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9796344411759619, + "compression/movement_sparsity/importance_threshold": -0.00014263497345634504, + "compression/movement_sparsity/linear_layer_sparsity": 0.9091227943151674, + "compression/movement_sparsity/model_sparsity": 0.8778916295415963, + "compression_loss": 103.93344116210938, + "distillation_loss": 5.238717079162598, + "epoch": 4.18, + "learning_rate": 3.232365924673617e-05, + "loss": 108.3677, + "step": 4947, + "task_loss": 2.6636807918548584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9796973625283871, + "compression/movement_sparsity/importance_threshold": -0.00014219428898946864, + "compression/movement_sparsity/linear_layer_sparsity": 0.9091323813459466, + "compression/movement_sparsity/model_sparsity": 0.8779008872283751, + "compression_loss": 103.93955993652344, + "distillation_loss": 3.691100597381592, + "epoch": 4.18, + "learning_rate": 3.231896308819386e-05, + "loss": 108.4561, + "step": 4948, + "task_loss": 2.0553925037384033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9797601541461111, + "compression/movement_sparsity/importance_threshold": -0.00014175451315003723, + "compression/movement_sparsity/linear_layer_sparsity": 0.9092285974546004, + "compression/movement_sparsity/model_sparsity": 0.8779937980177013, + "compression_loss": 103.94566345214844, + "distillation_loss": 6.505134582519531, + "epoch": 4.18, + "learning_rate": 3.231426692965155e-05, + "loss": 109.3271, + "step": 4949, + "task_loss": 3.639892339706421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9798228161630189, + "compression/movement_sparsity/importance_threshold": -0.00014131564500035384, + "compression/movement_sparsity/linear_layer_sparsity": 0.9092975787643739, + "compression/movement_sparsity/model_sparsity": 0.8780604096072727, + "compression_loss": 103.95176696777344, + "distillation_loss": 4.966371536254883, + "epoch": 4.18, + "learning_rate": 3.2309570771109235e-05, + "loss": 108.7232, + "step": 4950, + "task_loss": 2.5394816398620605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9798853487129957, + "compression/movement_sparsity/importance_threshold": -0.0001408776836027241, + "compression/movement_sparsity/linear_layer_sparsity": 0.9093315745663038, + "compression/movement_sparsity/model_sparsity": 0.878093237548823, + "compression_loss": 103.95781707763672, + "distillation_loss": 3.184843063354492, + "epoch": 4.19, + "learning_rate": 3.230487461256692e-05, + "loss": 108.213, + "step": 4951, + "task_loss": 1.816812515258789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9799477519299268, + "compression/movement_sparsity/importance_threshold": -0.0001404406280194493, + "compression/movement_sparsity/linear_layer_sparsity": 0.9093538369872799, + "compression/movement_sparsity/model_sparsity": 0.8781147351871513, + "compression_loss": 103.96385955810547, + "distillation_loss": 3.5028109550476074, + "epoch": 4.19, + "learning_rate": 3.230017845402461e-05, + "loss": 108.2512, + "step": 4952, + "task_loss": 1.4506467580795288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9800100259476971, + "compression/movement_sparsity/importance_threshold": -0.00014000447731283509, + "compression/movement_sparsity/linear_layer_sparsity": 0.9094300920393112, + "compression/movement_sparsity/model_sparsity": 0.8781883706435574, + "compression_loss": 103.96989440917969, + "distillation_loss": 3.575268268585205, + "epoch": 4.19, + "learning_rate": 3.22954822954823e-05, + "loss": 107.7554, + "step": 4953, + "task_loss": 1.6225048303604126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9800721709001919, + "compression/movement_sparsity/importance_threshold": -0.00013956923054518535, + "compression/movement_sparsity/linear_layer_sparsity": 0.9094287446083683, + "compression/movement_sparsity/model_sparsity": 0.8781870695010127, + "compression_loss": 103.97588348388672, + "distillation_loss": 4.586324691772461, + "epoch": 4.19, + "learning_rate": 3.229078613693998e-05, + "loss": 108.4797, + "step": 4954, + "task_loss": 2.459829092025757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9801341869212963, + "compression/movement_sparsity/importance_threshold": -0.00013913488677880486, + "compression/movement_sparsity/linear_layer_sparsity": 0.9094863621863849, + "compression/movement_sparsity/model_sparsity": 0.8782427077379719, + "compression_loss": 103.98187255859375, + "distillation_loss": 4.677105903625488, + "epoch": 4.19, + "learning_rate": 3.228608997839767e-05, + "loss": 108.5416, + "step": 4955, + "task_loss": 3.102701425552368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9801960741448954, + "compression/movement_sparsity/importance_threshold": -0.00013870144507599578, + "compression/movement_sparsity/linear_layer_sparsity": 0.9094686547974455, + "compression/movement_sparsity/model_sparsity": 0.8782256086523169, + "compression_loss": 103.98786926269531, + "distillation_loss": 3.6570844650268555, + "epoch": 4.19, + "learning_rate": 3.228139381985536e-05, + "loss": 108.175, + "step": 4956, + "task_loss": 1.8535711765289307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9802578327048743, + "compression/movement_sparsity/importance_threshold": -0.00013826890449906287, + "compression/movement_sparsity/linear_layer_sparsity": 0.9095566670787659, + "compression/movement_sparsity/model_sparsity": 0.8783105974410165, + "compression_loss": 103.99382781982422, + "distillation_loss": 5.6279778480529785, + "epoch": 4.19, + "learning_rate": 3.2276697661313046e-05, + "loss": 108.555, + "step": 4957, + "task_loss": 2.464423894882202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9803194627351183, + "compression/movement_sparsity/importance_threshold": -0.0001378372641103109, + "compression/movement_sparsity/linear_layer_sparsity": 0.9096375964045105, + "compression/movement_sparsity/model_sparsity": 0.8783887465954541, + "compression_loss": 103.9997787475586, + "distillation_loss": 5.5202484130859375, + "epoch": 4.19, + "learning_rate": 3.227200150277074e-05, + "loss": 108.7242, + "step": 4958, + "task_loss": 2.882256507873535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9803809643695124, + "compression/movement_sparsity/importance_threshold": -0.00013740652297204378, + "compression/movement_sparsity/linear_layer_sparsity": 0.909685126136707, + "compression/movement_sparsity/model_sparsity": 0.878434643535131, + "compression_loss": 104.00574493408203, + "distillation_loss": 4.985022068023682, + "epoch": 4.19, + "learning_rate": 3.226730534422842e-05, + "loss": 108.5123, + "step": 4959, + "task_loss": 2.5230209827423096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9804423377419418, + "compression/movement_sparsity/importance_threshold": -0.00013697668014656366, + "compression/movement_sparsity/linear_layer_sparsity": 0.9096312169748253, + "compression/movement_sparsity/model_sparsity": 0.878382586318804, + "compression_loss": 104.0115737915039, + "distillation_loss": 4.303328514099121, + "epoch": 4.19, + "learning_rate": 3.226260918568611e-05, + "loss": 108.48, + "step": 4960, + "task_loss": 1.6808784008026123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9805035829862916, + "compression/movement_sparsity/importance_threshold": -0.00013654773469617704, + "compression/movement_sparsity/linear_layer_sparsity": 0.9096204136789472, + "compression/movement_sparsity/model_sparsity": 0.8783721541493741, + "compression_loss": 104.0174789428711, + "distillation_loss": 3.7154412269592285, + "epoch": 4.19, + "learning_rate": 3.22579130271438e-05, + "loss": 108.1768, + "step": 4961, + "task_loss": 2.5633575916290283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.980564700236447, + "compression/movement_sparsity/importance_threshold": -0.00013611968568318523, + "compression/movement_sparsity/linear_layer_sparsity": 0.909702809677311, + "compression/movement_sparsity/model_sparsity": 0.8784517195917144, + "compression_loss": 104.02338409423828, + "distillation_loss": 3.5759315490722656, + "epoch": 4.19, + "learning_rate": 3.225321686860149e-05, + "loss": 107.8621, + "step": 4962, + "task_loss": 2.4822640419006348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9806256896262932, + "compression/movement_sparsity/importance_threshold": -0.00013569253216989471, + "compression/movement_sparsity/linear_layer_sparsity": 0.9099019909735004, + "compression/movement_sparsity/model_sparsity": 0.8786440583976264, + "compression_loss": 104.02925109863281, + "distillation_loss": 3.9838249683380127, + "epoch": 4.2, + "learning_rate": 3.224852071005917e-05, + "loss": 108.0341, + "step": 4963, + "task_loss": 2.170424699783325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.980686551289715, + "compression/movement_sparsity/importance_threshold": -0.00013526627321860853, + "compression/movement_sparsity/linear_layer_sparsity": 0.9099067964130577, + "compression/movement_sparsity/model_sparsity": 0.8786486987555516, + "compression_loss": 104.03507995605469, + "distillation_loss": 3.334512233734131, + "epoch": 4.2, + "learning_rate": 3.224382455151686e-05, + "loss": 108.1032, + "step": 4964, + "task_loss": 2.5189616680145264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.980747285360598, + "compression/movement_sparsity/importance_threshold": -0.00013484090789163058, + "compression/movement_sparsity/linear_layer_sparsity": 0.9099511304683279, + "compression/movement_sparsity/model_sparsity": 0.8786915097996357, + "compression_loss": 104.04090881347656, + "distillation_loss": 3.6471285820007324, + "epoch": 4.2, + "learning_rate": 3.223912839297455e-05, + "loss": 108.2061, + "step": 4965, + "task_loss": 1.9344518184661865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.980807891972827, + "compression/movement_sparsity/importance_threshold": -0.0001344164352512639, + "compression/movement_sparsity/linear_layer_sparsity": 0.909972379335055, + "compression/movement_sparsity/model_sparsity": 0.8787120287024216, + "compression_loss": 104.04671478271484, + "distillation_loss": 3.805270195007324, + "epoch": 4.2, + "learning_rate": 3.2234432234432237e-05, + "loss": 108.5571, + "step": 4966, + "task_loss": 1.9280987977981567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9808683712602874, + "compression/movement_sparsity/importance_threshold": -0.0001339928543598141, + "compression/movement_sparsity/linear_layer_sparsity": 0.909907702649798, + "compression/movement_sparsity/model_sparsity": 0.878649573860272, + "compression_loss": 104.05254364013672, + "distillation_loss": 4.468003749847412, + "epoch": 4.2, + "learning_rate": 3.222973607588992e-05, + "loss": 108.4487, + "step": 4967, + "task_loss": 2.7226998805999756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9809287233568642, + "compression/movement_sparsity/importance_threshold": -0.00013357016427958338, + "compression/movement_sparsity/linear_layer_sparsity": 0.9099947848460428, + "compression/movement_sparsity/model_sparsity": 0.8787336645151795, + "compression_loss": 104.05830383300781, + "distillation_loss": 5.158664703369141, + "epoch": 4.2, + "learning_rate": 3.222503991734761e-05, + "loss": 108.6758, + "step": 4968, + "task_loss": 2.4551422595977783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9809889483964425, + "compression/movement_sparsity/importance_threshold": -0.00013314836407287822, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100253584118612, + "compression/movement_sparsity/model_sparsity": 0.8787631877849567, + "compression_loss": 104.0639877319336, + "distillation_loss": 5.051945686340332, + "epoch": 4.2, + "learning_rate": 3.22203437588053e-05, + "loss": 108.5161, + "step": 4969, + "task_loss": 2.524357557296753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9810490465129075, + "compression/movement_sparsity/importance_threshold": -0.0001327274528019999, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100611070664334, + "compression/movement_sparsity/model_sparsity": 0.8787977083632688, + "compression_loss": 104.06974029541016, + "distillation_loss": 5.046250820159912, + "epoch": 4.2, + "learning_rate": 3.221564760026299e-05, + "loss": 108.9531, + "step": 4970, + "task_loss": 3.820064067840576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9811090178401443, + "compression/movement_sparsity/importance_threshold": -0.00013230742952925496, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100581260245245, + "compression/movement_sparsity/model_sparsity": 0.8787948297293201, + "compression_loss": 104.075439453125, + "distillation_loss": 3.7273898124694824, + "epoch": 4.2, + "learning_rate": 3.221095144172067e-05, + "loss": 108.5203, + "step": 4971, + "task_loss": 2.4891371726989746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.981168862512038, + "compression/movement_sparsity/importance_threshold": -0.00013188829331694553, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100377118495319, + "compression/movement_sparsity/model_sparsity": 0.8787751168440399, + "compression_loss": 104.08113861083984, + "distillation_loss": 3.618380069732666, + "epoch": 4.2, + "learning_rate": 3.220625528317836e-05, + "loss": 108.8195, + "step": 4972, + "task_loss": 2.506181240081787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.981228580662474, + "compression/movement_sparsity/importance_threshold": -0.00013147004322737552, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100644816058744, + "compression/movement_sparsity/model_sparsity": 0.8788009669768986, + "compression_loss": 104.08683013916016, + "distillation_loss": 4.888124465942383, + "epoch": 4.2, + "learning_rate": 3.220155912463605e-05, + "loss": 108.679, + "step": 4973, + "task_loss": 2.743950843811035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9812881724253372, + "compression/movement_sparsity/importance_threshold": -0.0001310526783228497, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100372110344912, + "compression/movement_sparsity/model_sparsity": 0.8787746332335364, + "compression_loss": 104.09255981445312, + "distillation_loss": 4.180861473083496, + "epoch": 4.2, + "learning_rate": 3.219686296609374e-05, + "loss": 108.3442, + "step": 4974, + "task_loss": 2.295189142227173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9813476379345127, + "compression/movement_sparsity/importance_threshold": -0.00013063619766567197, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100385823137693, + "compression/movement_sparsity/model_sparsity": 0.8787759574051528, + "compression_loss": 104.09825134277344, + "distillation_loss": 4.914698123931885, + "epoch": 4.21, + "learning_rate": 3.219216680755143e-05, + "loss": 108.5093, + "step": 4975, + "task_loss": 2.9518239498138428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9814069773238858, + "compression/movement_sparsity/importance_threshold": -0.0001302206003181471, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100127068699995, + "compression/movement_sparsity/model_sparsity": 0.8787509708624787, + "compression_loss": 104.10387420654297, + "distillation_loss": 4.6972761154174805, + "epoch": 4.21, + "learning_rate": 3.2187470649009114e-05, + "loss": 108.6708, + "step": 4976, + "task_loss": 2.975581645965576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9814661907273416, + "compression/movement_sparsity/importance_threshold": -0.00012980588534257636, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100781586261527, + "compression/movement_sparsity/model_sparsity": 0.878814174149455, + "compression_loss": 104.10954284667969, + "distillation_loss": 3.283097982406616, + "epoch": 4.21, + "learning_rate": 3.21827744904668e-05, + "loss": 107.614, + "step": 4977, + "task_loss": 1.8064824342727661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9815252782787652, + "compression/movement_sparsity/importance_threshold": -0.00012939205180126714, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100707775663861, + "compression/movement_sparsity/model_sparsity": 0.8788070466517981, + "compression_loss": 104.11517333984375, + "distillation_loss": 3.0536389350891113, + "epoch": 4.21, + "learning_rate": 3.2178078331924486e-05, + "loss": 108.659, + "step": 4978, + "task_loss": 1.1758376359939575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9815842401120418, + "compression/movement_sparsity/importance_threshold": -0.00012897909875652074, + "compression/movement_sparsity/linear_layer_sparsity": 0.9100524262723946, + "compression/movement_sparsity/model_sparsity": 0.8787893257812103, + "compression_loss": 104.12081909179688, + "distillation_loss": 4.381033897399902, + "epoch": 4.21, + "learning_rate": 3.217338217338218e-05, + "loss": 108.1671, + "step": 4979, + "task_loss": 2.1976146697998047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9816430763610565, + "compression/movement_sparsity/importance_threshold": -0.0001285670252706428, + "compression/movement_sparsity/linear_layer_sparsity": 0.910067248012766, + "compression/movement_sparsity/model_sparsity": 0.8788036383492029, + "compression_loss": 104.12648010253906, + "distillation_loss": 4.64116096496582, + "epoch": 4.21, + "learning_rate": 3.216868601483986e-05, + "loss": 108.2276, + "step": 4980, + "task_loss": 2.452749729156494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9817017871596944, + "compression/movement_sparsity/importance_threshold": -0.0001281558304059363, + "compression/movement_sparsity/linear_layer_sparsity": 0.9101423344963689, + "compression/movement_sparsity/model_sparsity": 0.8788761453811011, + "compression_loss": 104.13203430175781, + "distillation_loss": 6.146230220794678, + "epoch": 4.21, + "learning_rate": 3.216398985629755e-05, + "loss": 108.974, + "step": 4981, + "task_loss": 2.807964563369751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9817603726418407, + "compression/movement_sparsity/importance_threshold": -0.00012774551322470608, + "compression/movement_sparsity/linear_layer_sparsity": 0.910203636642185, + "compression/movement_sparsity/model_sparsity": 0.8789353416096208, + "compression_loss": 104.13766479492188, + "distillation_loss": 5.033144950866699, + "epoch": 4.21, + "learning_rate": 3.215929369775524e-05, + "loss": 108.0734, + "step": 4982, + "task_loss": 3.7367429733276367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9818188329413805, + "compression/movement_sparsity/importance_threshold": -0.00012733607278925512, + "compression/movement_sparsity/linear_layer_sparsity": 0.9101836875097301, + "compression/movement_sparsity/model_sparsity": 0.8789160777912366, + "compression_loss": 104.1432876586914, + "distillation_loss": 6.25330924987793, + "epoch": 4.21, + "learning_rate": 3.2154597539212925e-05, + "loss": 109.0978, + "step": 4983, + "task_loss": 3.21545147895813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.981877168192199, + "compression/movement_sparsity/importance_threshold": -0.0001269275081618882, + "compression/movement_sparsity/linear_layer_sparsity": 0.9102833973995013, + "compression/movement_sparsity/model_sparsity": 0.8790123623395506, + "compression_loss": 104.1488037109375, + "distillation_loss": 3.7482471466064453, + "epoch": 4.21, + "learning_rate": 3.214990138067061e-05, + "loss": 108.0268, + "step": 4984, + "task_loss": 2.922497510910034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9819353785281812, + "compression/movement_sparsity/importance_threshold": -0.00012651981840490923, + "compression/movement_sparsity/linear_layer_sparsity": 0.9103520567567485, + "compression/movement_sparsity/model_sparsity": 0.8790786630366555, + "compression_loss": 104.15441131591797, + "distillation_loss": 4.08882999420166, + "epoch": 4.21, + "learning_rate": 3.21452052221283e-05, + "loss": 108.7037, + "step": 4985, + "task_loss": 3.259140729904175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9819934640832124, + "compression/movement_sparsity/importance_threshold": -0.0001261130025806221, + "compression/movement_sparsity/linear_layer_sparsity": 0.9103514128516962, + "compression/movement_sparsity/model_sparsity": 0.8790780412517226, + "compression_loss": 104.15991973876953, + "distillation_loss": 4.379223346710205, + "epoch": 4.21, + "learning_rate": 3.214050906358599e-05, + "loss": 108.5853, + "step": 4986, + "task_loss": 2.260141372680664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9820514249911777, + "compression/movement_sparsity/importance_threshold": -0.0001257070597513307, + "compression/movement_sparsity/linear_layer_sparsity": 0.910390846074068, + "compression/movement_sparsity/model_sparsity": 0.8791161198215952, + "compression_loss": 104.16545867919922, + "distillation_loss": 4.392611503601074, + "epoch": 4.22, + "learning_rate": 3.213581290504368e-05, + "loss": 108.2725, + "step": 4987, + "task_loss": 2.0115554332733154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9821092613859622, + "compression/movement_sparsity/importance_threshold": -0.00012530198897933807, + "compression/movement_sparsity/linear_layer_sparsity": 0.9104344169826094, + "compression/movement_sparsity/model_sparsity": 0.8791581939353884, + "compression_loss": 104.17098999023438, + "distillation_loss": 5.762123107910156, + "epoch": 4.22, + "learning_rate": 3.213111674650136e-05, + "loss": 108.8023, + "step": 4988, + "task_loss": 4.737924098968506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9821669734014511, + "compression/movement_sparsity/importance_threshold": -0.00012489778932694984, + "compression/movement_sparsity/linear_layer_sparsity": 0.9104602089572058, + "compression/movement_sparsity/model_sparsity": 0.879183099876312, + "compression_loss": 104.17650604248047, + "distillation_loss": 3.920383930206299, + "epoch": 4.22, + "learning_rate": 3.212642058795905e-05, + "loss": 108.088, + "step": 4989, + "task_loss": 2.5375595092773438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9822245611715296, + "compression/movement_sparsity/importance_threshold": -0.00012449445985646904, + "compression/movement_sparsity/linear_layer_sparsity": 0.9104644539608842, + "compression/movement_sparsity/model_sparsity": 0.8791871990510549, + "compression_loss": 104.18199920654297, + "distillation_loss": 3.7607803344726562, + "epoch": 4.22, + "learning_rate": 3.2121724429416736e-05, + "loss": 108.3436, + "step": 4990, + "task_loss": 3.2512242794036865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9822820248300825, + "compression/movement_sparsity/importance_threshold": -0.00012409199963020045, + "compression/movement_sparsity/linear_layer_sparsity": 0.9104931315840484, + "compression/movement_sparsity/model_sparsity": 0.8792148915096407, + "compression_loss": 104.1875, + "distillation_loss": 4.936339378356934, + "epoch": 4.22, + "learning_rate": 3.211702827087443e-05, + "loss": 108.5708, + "step": 4991, + "task_loss": 1.9474214315414429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9823393645109953, + "compression/movement_sparsity/importance_threshold": -0.00012369040771044709, + "compression/movement_sparsity/linear_layer_sparsity": 0.9104792518529202, + "compression/movement_sparsity/model_sparsity": 0.8792014885899759, + "compression_loss": 104.19292449951172, + "distillation_loss": 5.581599235534668, + "epoch": 4.22, + "learning_rate": 3.2112332112332115e-05, + "loss": 108.1149, + "step": 4992, + "task_loss": 3.2073309421539307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9823965803481531, + "compression/movement_sparsity/importance_threshold": -0.00012328968315951285, + "compression/movement_sparsity/linear_layer_sparsity": 0.9104965895926628, + "compression/movement_sparsity/model_sparsity": 0.8792182307250211, + "compression_loss": 104.19840240478516, + "distillation_loss": 6.594862937927246, + "epoch": 4.22, + "learning_rate": 3.21076359537898e-05, + "loss": 108.6041, + "step": 4993, + "task_loss": 3.327862501144409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9824536724754409, + "compression/movement_sparsity/importance_threshold": -0.00012288982503970338, + "compression/movement_sparsity/linear_layer_sparsity": 0.9104894231679137, + "compression/movement_sparsity/model_sparsity": 0.8792113104890086, + "compression_loss": 104.20378875732422, + "distillation_loss": 4.68754243850708, + "epoch": 4.22, + "learning_rate": 3.210293979524749e-05, + "loss": 108.781, + "step": 4994, + "task_loss": 2.786020278930664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9825106410267439, + "compression/movement_sparsity/importance_threshold": -0.00012249083241332084, + "compression/movement_sparsity/linear_layer_sparsity": 0.9104910448547121, + "compression/movement_sparsity/model_sparsity": 0.8792128764658766, + "compression_loss": 104.20924377441406, + "distillation_loss": 4.950472354888916, + "epoch": 4.22, + "learning_rate": 3.2098243636705174e-05, + "loss": 108.1564, + "step": 4995, + "task_loss": 2.900503158569336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9825674861359474, + "compression/movement_sparsity/importance_threshold": -0.00012209270434266912, + "compression/movement_sparsity/linear_layer_sparsity": 0.9104882903719882, + "compression/movement_sparsity/model_sparsity": 0.8792102166081082, + "compression_loss": 104.2146987915039, + "distillation_loss": 3.38508677482605, + "epoch": 4.22, + "learning_rate": 3.209354747816287e-05, + "loss": 108.2756, + "step": 4996, + "task_loss": 1.3145182132720947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9826242079369362, + "compression/movement_sparsity/importance_threshold": -0.00012169543989005386, + "compression/movement_sparsity/linear_layer_sparsity": 0.9105573074542647, + "compression/movement_sparsity/model_sparsity": 0.8792768627412869, + "compression_loss": 104.2201156616211, + "distillation_loss": 3.99588942527771, + "epoch": 4.22, + "learning_rate": 3.208885131962055e-05, + "loss": 108.4481, + "step": 4997, + "task_loss": 2.871309280395508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9826808065635957, + "compression/movement_sparsity/importance_threshold": -0.00012129903811777722, + "compression/movement_sparsity/linear_layer_sparsity": 0.9106197304718383, + "compression/movement_sparsity/model_sparsity": 0.8793371413361714, + "compression_loss": 104.2254638671875, + "distillation_loss": 6.406543731689453, + "epoch": 4.22, + "learning_rate": 3.208415516107824e-05, + "loss": 109.0371, + "step": 4998, + "task_loss": 2.234088897705078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9827372821498109, + "compression/movement_sparsity/importance_threshold": -0.00012090349808814484, + "compression/movement_sparsity/linear_layer_sparsity": 0.9105647362107018, + "compression/movement_sparsity/model_sparsity": 0.879284036297087, + "compression_loss": 104.23081970214844, + "distillation_loss": 4.646080493927002, + "epoch": 4.23, + "learning_rate": 3.2079459002535926e-05, + "loss": 108.6644, + "step": 4999, + "task_loss": 2.531714677810669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9827936348294671, + "compression/movement_sparsity/importance_threshold": -0.00012050881886345975, + "compression/movement_sparsity/linear_layer_sparsity": 0.9105828967180112, + "compression/movement_sparsity/model_sparsity": 0.879301572935102, + "compression_loss": 104.23619079589844, + "distillation_loss": 3.7410311698913574, + "epoch": 4.23, + "learning_rate": 3.207476284399362e-05, + "loss": 107.7993, + "step": 5000, + "task_loss": 2.567991256713867 + }, + { + "epoch": 4.23, + "eval_accuracy": 0.5636039603960395, + "eval_loss": 108.08809661865234, + "eval_runtime": 225.6217, + "eval_samples_per_second": 111.913, + "eval_steps_per_second": 0.878, + "step": 5000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9828498647364494, + "compression/movement_sparsity/importance_threshold": -0.00012011499950602585, + "compression/movement_sparsity/linear_layer_sparsity": 0.9106460948064813, + "compression/movement_sparsity/model_sparsity": 0.8793625999748131, + "compression_loss": 104.24150848388672, + "distillation_loss": 3.8410282135009766, + "epoch": 4.23, + "learning_rate": 3.20700666854513e-05, + "loss": 108.6169, + "step": 5001, + "task_loss": 1.5692440271377563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9829059720046428, + "compression/movement_sparsity/importance_threshold": -0.00011972203907814703, + "compression/movement_sparsity/linear_layer_sparsity": 0.9107031519486188, + "compression/movement_sparsity/model_sparsity": 0.87941769702859, + "compression_loss": 104.24684143066406, + "distillation_loss": 5.2896728515625, + "epoch": 4.23, + "learning_rate": 3.2065370526908985e-05, + "loss": 108.7589, + "step": 5002, + "task_loss": 3.049318552017212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9829619567679325, + "compression/movement_sparsity/importance_threshold": -0.00011932993664212892, + "compression/movement_sparsity/linear_layer_sparsity": 0.9107144322112024, + "compression/movement_sparsity/model_sparsity": 0.8794285897794516, + "compression_loss": 104.25211334228516, + "distillation_loss": 5.057112693786621, + "epoch": 4.23, + "learning_rate": 3.206067436836668e-05, + "loss": 108.7067, + "step": 5003, + "task_loss": 2.031747817993164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9830178191602038, + "compression/movement_sparsity/importance_threshold": -0.00011893869126027284, + "compression/movement_sparsity/linear_layer_sparsity": 0.9107942048926863, + "compression/movement_sparsity/model_sparsity": 0.8795056220239171, + "compression_loss": 104.25740814208984, + "distillation_loss": 3.154707431793213, + "epoch": 4.23, + "learning_rate": 3.2055978209824365e-05, + "loss": 108.4804, + "step": 5004, + "task_loss": 2.0009868144989014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9830735593153417, + "compression/movement_sparsity/importance_threshold": -0.00011854830199488353, + "compression/movement_sparsity/linear_layer_sparsity": 0.9108988036911881, + "compression/movement_sparsity/model_sparsity": 0.8796066275319068, + "compression_loss": 104.26274871826172, + "distillation_loss": 5.2205810546875, + "epoch": 4.23, + "learning_rate": 3.205128205128206e-05, + "loss": 109.1014, + "step": 5005, + "task_loss": 2.3252501487731934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9831291773672313, + "compression/movement_sparsity/importance_threshold": -0.00011815876790826664, + "compression/movement_sparsity/linear_layer_sparsity": 0.9109258238550509, + "compression/movement_sparsity/model_sparsity": 0.8796327194700172, + "compression_loss": 104.2680435180664, + "distillation_loss": 4.400875091552734, + "epoch": 4.23, + "learning_rate": 3.204658589273974e-05, + "loss": 109.0912, + "step": 5006, + "task_loss": 3.1280903816223145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9831846734497577, + "compression/movement_sparsity/importance_threshold": -0.00011777008806272606, + "compression/movement_sparsity/linear_layer_sparsity": 0.910920159875424, + "compression/movement_sparsity/model_sparsity": 0.8796272500655148, + "compression_loss": 104.27330017089844, + "distillation_loss": 3.95890474319458, + "epoch": 4.23, + "learning_rate": 3.204188973419743e-05, + "loss": 108.1134, + "step": 5007, + "task_loss": 1.7033382654190063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9832400476968063, + "compression/movement_sparsity/importance_threshold": -0.0001173822615205631, + "compression/movement_sparsity/linear_layer_sparsity": 0.9109585556952114, + "compression/movement_sparsity/model_sparsity": 0.8796643268707732, + "compression_loss": 104.27851867675781, + "distillation_loss": 3.591562032699585, + "epoch": 4.23, + "learning_rate": 3.203719357565512e-05, + "loss": 108.8108, + "step": 5008, + "task_loss": 1.7163728475570679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.983295300242262, + "compression/movement_sparsity/importance_threshold": -0.00011699528734408424, + "compression/movement_sparsity/linear_layer_sparsity": 0.9109861601432885, + "compression/movement_sparsity/model_sparsity": 0.8796909830211376, + "compression_loss": 104.2837142944336, + "distillation_loss": 5.372992992401123, + "epoch": 4.23, + "learning_rate": 3.2032497417112803e-05, + "loss": 108.8589, + "step": 5009, + "task_loss": 2.20035457611084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.98335043122001, + "compression/movement_sparsity/importance_threshold": -0.00011660916459559252, + "compression/movement_sparsity/linear_layer_sparsity": 0.9109619660071553, + "compression/movement_sparsity/model_sparsity": 0.8796676200280105, + "compression_loss": 104.28894805908203, + "distillation_loss": 4.983962059020996, + "epoch": 4.23, + "learning_rate": 3.202780125857049e-05, + "loss": 108.7299, + "step": 5010, + "task_loss": 2.9711713790893555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9834054407639355, + "compression/movement_sparsity/importance_threshold": -0.00011622389233739098, + "compression/movement_sparsity/linear_layer_sparsity": 0.9110587902483585, + "compression/movement_sparsity/model_sparsity": 0.8797611180586622, + "compression_loss": 104.2941665649414, + "distillation_loss": 3.4205827713012695, + "epoch": 4.24, + "learning_rate": 3.2023105100028176e-05, + "loss": 107.8804, + "step": 5011, + "task_loss": 2.3496716022491455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9834603290079236, + "compression/movement_sparsity/importance_threshold": -0.00011583946963178524, + "compression/movement_sparsity/linear_layer_sparsity": 0.9110237451196767, + "compression/movement_sparsity/model_sparsity": 0.879727276837962, + "compression_loss": 104.29930877685547, + "distillation_loss": 4.932743072509766, + "epoch": 4.24, + "learning_rate": 3.201840894148587e-05, + "loss": 108.7288, + "step": 5012, + "task_loss": 1.6641490459442139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9835150960858594, + "compression/movement_sparsity/importance_threshold": -0.00011545589554107921, + "compression/movement_sparsity/linear_layer_sparsity": 0.9110707144159943, + "compression/movement_sparsity/model_sparsity": 0.8797726325944567, + "compression_loss": 104.30450439453125, + "distillation_loss": 5.274240016937256, + "epoch": 4.24, + "learning_rate": 3.2013712782943556e-05, + "loss": 109.0163, + "step": 5013, + "task_loss": 2.080785036087036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9835697421316282, + "compression/movement_sparsity/importance_threshold": -0.00011507316912757505, + "compression/movement_sparsity/linear_layer_sparsity": 0.9111329227985506, + "compression/movement_sparsity/model_sparsity": 0.8798327039276969, + "compression_loss": 104.30966186523438, + "distillation_loss": 4.168460369110107, + "epoch": 4.24, + "learning_rate": 3.200901662440124e-05, + "loss": 108.1448, + "step": 5014, + "task_loss": 2.5841617584228516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9836242672791149, + "compression/movement_sparsity/importance_threshold": -0.00011469128945357926, + "compression/movement_sparsity/linear_layer_sparsity": 0.9111332089785739, + "compression/movement_sparsity/model_sparsity": 0.8798329802765559, + "compression_loss": 104.31482696533203, + "distillation_loss": 4.5313825607299805, + "epoch": 4.24, + "learning_rate": 3.200432046585893e-05, + "loss": 108.536, + "step": 5015, + "task_loss": 2.3273134231567383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9836786716622047, + "compression/movement_sparsity/importance_threshold": -0.00011431025558139487, + "compression/movement_sparsity/linear_layer_sparsity": 0.9110947654621159, + "compression/movement_sparsity/model_sparsity": 0.8797958574131542, + "compression_loss": 104.31990814208984, + "distillation_loss": 4.947587490081787, + "epoch": 4.24, + "learning_rate": 3.1999624307316615e-05, + "loss": 108.8537, + "step": 5016, + "task_loss": 2.730464458465576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9837329554147829, + "compression/movement_sparsity/importance_threshold": -0.00011393006657332491, + "compression/movement_sparsity/linear_layer_sparsity": 0.9111604437774542, + "compression/movement_sparsity/model_sparsity": 0.8798592794763106, + "compression_loss": 104.32501220703125, + "distillation_loss": 5.169883728027344, + "epoch": 4.24, + "learning_rate": 3.199492814877431e-05, + "loss": 108.8401, + "step": 5017, + "task_loss": 2.4348621368408203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9837871186707345, + "compression/movement_sparsity/importance_threshold": -0.00011355072149167415, + "compression/movement_sparsity/linear_layer_sparsity": 0.9112713266123001, + "compression/movement_sparsity/model_sparsity": 0.8799663531446641, + "compression_loss": 104.33008575439453, + "distillation_loss": 4.420599937438965, + "epoch": 4.24, + "learning_rate": 3.199023199023199e-05, + "loss": 109.0665, + "step": 5018, + "task_loss": 3.029005289077759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9838411615639447, + "compression/movement_sparsity/importance_threshold": -0.00011317221939874649, + "compression/movement_sparsity/linear_layer_sparsity": 0.9112816410173051, + "compression/movement_sparsity/model_sparsity": 0.8799763132181263, + "compression_loss": 104.33518981933594, + "distillation_loss": 4.387458801269531, + "epoch": 4.24, + "learning_rate": 3.198553583168968e-05, + "loss": 109.2601, + "step": 5019, + "task_loss": 3.3819122314453125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9838950842282987, + "compression/movement_sparsity/importance_threshold": -0.00011279455935684496, + "compression/movement_sparsity/linear_layer_sparsity": 0.9113776782634444, + "compression/movement_sparsity/model_sparsity": 0.8800690512894156, + "compression_loss": 104.34021759033203, + "distillation_loss": 5.056623935699463, + "epoch": 4.24, + "learning_rate": 3.198083967314737e-05, + "loss": 108.4819, + "step": 5020, + "task_loss": 2.787040948867798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9839488867976816, + "compression/movement_sparsity/importance_threshold": -0.00011241774042827519, + "compression/movement_sparsity/linear_layer_sparsity": 0.9113959937849331, + "compression/movement_sparsity/model_sparsity": 0.880086737616396, + "compression_loss": 104.34530639648438, + "distillation_loss": 5.043505668640137, + "epoch": 4.24, + "learning_rate": 3.197614351460505e-05, + "loss": 108.7554, + "step": 5021, + "task_loss": 2.578838586807251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9840025694059783, + "compression/movement_sparsity/importance_threshold": -0.00011204176167534108, + "compression/movement_sparsity/linear_layer_sparsity": 0.9113515881846571, + "compression/movement_sparsity/model_sparsity": 0.8800438574850971, + "compression_loss": 104.35037994384766, + "distillation_loss": 3.8916447162628174, + "epoch": 4.24, + "learning_rate": 3.1971447356062746e-05, + "loss": 108.6567, + "step": 5022, + "task_loss": 2.1005728244781494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9840561321870743, + "compression/movement_sparsity/importance_threshold": -0.00011166662216034479, + "compression/movement_sparsity/linear_layer_sparsity": 0.9113290038111548, + "compression/movement_sparsity/model_sparsity": 0.8800220489543022, + "compression_loss": 104.35537719726562, + "distillation_loss": 4.1908464431762695, + "epoch": 4.25, + "learning_rate": 3.1966751197520426e-05, + "loss": 108.3442, + "step": 5023, + "task_loss": 1.8488974571228027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9841095752748545, + "compression/movement_sparsity/importance_threshold": -0.00011129232094559197, + "compression/movement_sparsity/linear_layer_sparsity": 0.911314325160795, + "compression/movement_sparsity/model_sparsity": 0.8800078745607391, + "compression_loss": 104.3603515625, + "distillation_loss": 4.80531120300293, + "epoch": 4.25, + "learning_rate": 3.196205503897812e-05, + "loss": 108.5211, + "step": 5024, + "task_loss": 2.7543392181396484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9841628988032042, + "compression/movement_sparsity/importance_threshold": -0.00011091885709338563, + "compression/movement_sparsity/linear_layer_sparsity": 0.9113686755168793, + "compression/movement_sparsity/model_sparsity": 0.8800603578148907, + "compression_loss": 104.3653335571289, + "distillation_loss": 5.396255970001221, + "epoch": 4.25, + "learning_rate": 3.1957358880435805e-05, + "loss": 108.6167, + "step": 5025, + "task_loss": 3.078479528427124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9842161029060085, + "compression/movement_sparsity/importance_threshold": -0.00011054622966603055, + "compression/movement_sparsity/linear_layer_sparsity": 0.9113932154538739, + "compression/movement_sparsity/model_sparsity": 0.8800840547295559, + "compression_loss": 104.37031555175781, + "distillation_loss": 4.5727858543396, + "epoch": 4.25, + "learning_rate": 3.195266272189349e-05, + "loss": 108.5016, + "step": 5026, + "task_loss": 2.9454221725463867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9842691877171524, + "compression/movement_sparsity/importance_threshold": -0.00011017443772582975, + "compression/movement_sparsity/linear_layer_sparsity": 0.9114416991194813, + "compression/movement_sparsity/model_sparsity": 0.8801308728320965, + "compression_loss": 104.37532806396484, + "distillation_loss": 3.7028183937072754, + "epoch": 4.25, + "learning_rate": 3.194796656335118e-05, + "loss": 107.6632, + "step": 5027, + "task_loss": 2.0604896545410156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9843221533705212, + "compression/movement_sparsity/importance_threshold": -0.00010980348033508888, + "compression/movement_sparsity/linear_layer_sparsity": 0.9114545056755222, + "compression/movement_sparsity/model_sparsity": 0.8801432394435398, + "compression_loss": 104.3802490234375, + "distillation_loss": 6.112961769104004, + "epoch": 4.25, + "learning_rate": 3.1943270404808864e-05, + "loss": 108.7897, + "step": 5028, + "task_loss": 3.3483402729034424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.984375, + "compression/movement_sparsity/importance_threshold": -0.00010943335655611008, + "compression/movement_sparsity/linear_layer_sparsity": 0.9113830322147128, + "compression/movement_sparsity/model_sparsity": 0.8800742213159873, + "compression_loss": 104.38521575927734, + "distillation_loss": 5.418972969055176, + "epoch": 4.25, + "learning_rate": 3.193857424626656e-05, + "loss": 108.7136, + "step": 5029, + "task_loss": 2.8751347064971924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.984427727739474, + "compression/movement_sparsity/importance_threshold": -0.00010906406545119814, + "compression/movement_sparsity/linear_layer_sparsity": 0.9114651062605506, + "compression/movement_sparsity/model_sparsity": 0.8801534758658611, + "compression_loss": 104.39014434814453, + "distillation_loss": 3.4208569526672363, + "epoch": 4.25, + "learning_rate": 3.1933878087724244e-05, + "loss": 108.6336, + "step": 5030, + "task_loss": 1.2710500955581665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9844803367228281, + "compression/movement_sparsity/importance_threshold": -0.00010869560608265782, + "compression/movement_sparsity/linear_layer_sparsity": 0.9115104300217344, + "compression/movement_sparsity/model_sparsity": 0.8801972426164162, + "compression_loss": 104.3951187133789, + "distillation_loss": 4.023708343505859, + "epoch": 4.25, + "learning_rate": 3.192918192918193e-05, + "loss": 108.7203, + "step": 5031, + "task_loss": 2.1587107181549072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9845328270839477, + "compression/movement_sparsity/importance_threshold": -0.0001083279775127904, + "compression/movement_sparsity/linear_layer_sparsity": 0.9114767561723307, + "compression/movement_sparsity/model_sparsity": 0.8801647255673324, + "compression_loss": 104.40001678466797, + "distillation_loss": 5.2169647216796875, + "epoch": 4.25, + "learning_rate": 3.1924485770639616e-05, + "loss": 109.0389, + "step": 5032, + "task_loss": 2.93900465965271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9845851989567178, + "compression/movement_sparsity/importance_threshold": -0.00010796117880390414, + "compression/movement_sparsity/linear_layer_sparsity": 0.9115485873581691, + "compression/movement_sparsity/model_sparsity": 0.8802340891309587, + "compression_loss": 104.40501403808594, + "distillation_loss": 3.410867691040039, + "epoch": 4.25, + "learning_rate": 3.191978961209731e-05, + "loss": 108.1407, + "step": 5033, + "task_loss": 2.0547780990600586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9846374524750237, + "compression/movement_sparsity/importance_threshold": -0.00010759520901829946, + "compression/movement_sparsity/linear_layer_sparsity": 0.9115758340812171, + "compression/movement_sparsity/model_sparsity": 0.8802603998452493, + "compression_loss": 104.4099349975586, + "distillation_loss": 4.657121658325195, + "epoch": 4.26, + "learning_rate": 3.1915093453554996e-05, + "loss": 109.0712, + "step": 5034, + "task_loss": 2.269298791885376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9846895877727504, + "compression/movement_sparsity/importance_threshold": -0.00010723006721828025, + "compression/movement_sparsity/linear_layer_sparsity": 0.9115134945328168, + "compression/movement_sparsity/model_sparsity": 0.8802002018521153, + "compression_loss": 104.41484069824219, + "distillation_loss": 3.967438220977783, + "epoch": 4.26, + "learning_rate": 3.1910397295012675e-05, + "loss": 107.588, + "step": 5035, + "task_loss": 1.4696491956710815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.984741604983783, + "compression/movement_sparsity/importance_threshold": -0.00010686575246615301, + "compression/movement_sparsity/linear_layer_sparsity": 0.9114568785848818, + "compression/movement_sparsity/model_sparsity": 0.8801455308361629, + "compression_loss": 104.41972351074219, + "distillation_loss": 4.29909086227417, + "epoch": 4.26, + "learning_rate": 3.190570113647037e-05, + "loss": 109.0746, + "step": 5036, + "task_loss": 2.5707712173461914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9847935042420068, + "compression/movement_sparsity/importance_threshold": -0.00010650226382421992, + "compression/movement_sparsity/linear_layer_sparsity": 0.9115578882089251, + "compression/movement_sparsity/model_sparsity": 0.8802430704688785, + "compression_loss": 104.42462921142578, + "distillation_loss": 4.294763565063477, + "epoch": 4.26, + "learning_rate": 3.1901004977928055e-05, + "loss": 108.8264, + "step": 5037, + "task_loss": 2.1533334255218506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9848452856813068, + "compression/movement_sparsity/importance_threshold": -0.0001061396003547866, + "compression/movement_sparsity/linear_layer_sparsity": 0.9115662232021027, + "compression/movement_sparsity/model_sparsity": 0.8802511191293988, + "compression_loss": 104.42950439453125, + "distillation_loss": 4.899670600891113, + "epoch": 4.26, + "learning_rate": 3.189630881938575e-05, + "loss": 109.0155, + "step": 5038, + "task_loss": 2.5415704250335693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9848969494355683, + "compression/movement_sparsity/importance_threshold": -0.00010577776112015436, + "compression/movement_sparsity/linear_layer_sparsity": 0.9115838471218685, + "compression/movement_sparsity/model_sparsity": 0.8802681376133031, + "compression_loss": 104.43428802490234, + "distillation_loss": 5.118988990783691, + "epoch": 4.26, + "learning_rate": 3.1891612660843434e-05, + "loss": 108.9186, + "step": 5039, + "task_loss": 2.8862574100494385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9849484956386761, + "compression/movement_sparsity/importance_threshold": -0.00010541674518263055, + "compression/movement_sparsity/linear_layer_sparsity": 0.9116718594031887, + "compression/movement_sparsity/model_sparsity": 0.8803531264020027, + "compression_loss": 104.4391098022461, + "distillation_loss": 4.8668107986450195, + "epoch": 4.26, + "learning_rate": 3.188691650230112e-05, + "loss": 109.0847, + "step": 5040, + "task_loss": 1.5400503873825073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9849999244245158, + "compression/movement_sparsity/importance_threshold": -0.00010505655160451561, + "compression/movement_sparsity/linear_layer_sparsity": 0.9117413176796676, + "compression/movement_sparsity/model_sparsity": 0.8804201985730059, + "compression_loss": 104.4439697265625, + "distillation_loss": 4.153745651245117, + "epoch": 4.26, + "learning_rate": 3.188222034375881e-05, + "loss": 108.4479, + "step": 5041, + "task_loss": 1.8383405208587646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9850512359269723, + "compression/movement_sparsity/importance_threshold": -0.00010469717944811604, + "compression/movement_sparsity/linear_layer_sparsity": 0.9117762793391759, + "compression/movement_sparsity/model_sparsity": 0.8804539591919555, + "compression_loss": 104.44877624511719, + "distillation_loss": 4.911004066467285, + "epoch": 4.26, + "learning_rate": 3.187752418521649e-05, + "loss": 108.5196, + "step": 5042, + "task_loss": 2.4689230918884277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9851024302799307, + "compression/movement_sparsity/importance_threshold": -0.00010433862777573487, + "compression/movement_sparsity/linear_layer_sparsity": 0.9117506304545913, + "compression/movement_sparsity/model_sparsity": 0.8804291914254614, + "compression_loss": 104.45355224609375, + "distillation_loss": 2.248910427093506, + "epoch": 4.26, + "learning_rate": 3.1872828026674186e-05, + "loss": 108.1036, + "step": 5043, + "task_loss": 0.9459171295166016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9851535076172762, + "compression/movement_sparsity/importance_threshold": -0.000103980895649676, + "compression/movement_sparsity/linear_layer_sparsity": 0.9117966338933304, + "compression/movement_sparsity/model_sparsity": 0.8804736145045567, + "compression_loss": 104.4583511352539, + "distillation_loss": 5.878549575805664, + "epoch": 4.26, + "learning_rate": 3.1868131868131866e-05, + "loss": 108.4979, + "step": 5044, + "task_loss": 2.47756028175354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.985204468072894, + "compression/movement_sparsity/importance_threshold": -0.00010362398213224332, + "compression/movement_sparsity/linear_layer_sparsity": 0.9118314643869948, + "compression/movement_sparsity/model_sparsity": 0.8805072484636126, + "compression_loss": 104.46318054199219, + "distillation_loss": 5.743114471435547, + "epoch": 4.26, + "learning_rate": 3.186343570958956e-05, + "loss": 108.7096, + "step": 5045, + "task_loss": 3.053860902786255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.985255311780669, + "compression/movement_sparsity/importance_threshold": -0.00010326788628574248, + "compression/movement_sparsity/linear_layer_sparsity": 0.9118860413022641, + "compression/movement_sparsity/model_sparsity": 0.8805599504939442, + "compression_loss": 104.46797943115234, + "distillation_loss": 4.399833679199219, + "epoch": 4.27, + "learning_rate": 3.1858739551047245e-05, + "loss": 108.348, + "step": 5046, + "task_loss": 2.8133544921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9853060388744866, + "compression/movement_sparsity/importance_threshold": -0.00010291260717247477, + "compression/movement_sparsity/linear_layer_sparsity": 0.9119109151159526, + "compression/movement_sparsity/model_sparsity": 0.8805839698156116, + "compression_loss": 104.47276306152344, + "distillation_loss": 3.8360495567321777, + "epoch": 4.27, + "learning_rate": 3.185404339250493e-05, + "loss": 108.1954, + "step": 5047, + "task_loss": 2.0244178771972656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.985356649488232, + "compression/movement_sparsity/importance_threshold": -0.00010255814385474581, + "compression/movement_sparsity/linear_layer_sparsity": 0.9119037129187004, + "compression/movement_sparsity/model_sparsity": 0.8805770150359917, + "compression_loss": 104.47752380371094, + "distillation_loss": 4.759662628173828, + "epoch": 4.27, + "learning_rate": 3.184934723396262e-05, + "loss": 108.6321, + "step": 5048, + "task_loss": 3.2276015281677246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9854071437557902, + "compression/movement_sparsity/importance_threshold": -0.00010220449539485865, + "compression/movement_sparsity/linear_layer_sparsity": 0.9120151919619282, + "compression/movement_sparsity/model_sparsity": 0.8806846644311349, + "compression_loss": 104.48231506347656, + "distillation_loss": 4.029386520385742, + "epoch": 4.27, + "learning_rate": 3.1844651075420304e-05, + "loss": 108.9369, + "step": 5049, + "task_loss": 1.7876924276351929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9854575218110462, + "compression/movement_sparsity/importance_threshold": -0.00010185166085511892, + "compression/movement_sparsity/linear_layer_sparsity": 0.9121171078227118, + "compression/movement_sparsity/model_sparsity": 0.8807830791685708, + "compression_loss": 104.48702239990234, + "distillation_loss": 5.76923942565918, + "epoch": 4.27, + "learning_rate": 3.1839954916878e-05, + "loss": 108.7576, + "step": 5050, + "task_loss": 2.215848207473755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9855077837878853, + "compression/movement_sparsity/importance_threshold": -0.00010149963929782964, + "compression/movement_sparsity/linear_layer_sparsity": 0.912057963951238, + "compression/movement_sparsity/model_sparsity": 0.8807259670710299, + "compression_loss": 104.4917984008789, + "distillation_loss": 4.154844760894775, + "epoch": 4.27, + "learning_rate": 3.1835258758335684e-05, + "loss": 108.6595, + "step": 5051, + "task_loss": 2.0954430103302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9855579298201926, + "compression/movement_sparsity/importance_threshold": -0.00010114842978529472, + "compression/movement_sparsity/linear_layer_sparsity": 0.9120699835122149, + "compression/movement_sparsity/model_sparsity": 0.8807375737231108, + "compression_loss": 104.49649810791016, + "distillation_loss": 4.78985595703125, + "epoch": 4.27, + "learning_rate": 3.183056259979337e-05, + "loss": 108.5691, + "step": 5052, + "task_loss": 2.24904465675354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9856079600418534, + "compression/movement_sparsity/importance_threshold": -0.00010079803137981718, + "compression/movement_sparsity/linear_layer_sparsity": 0.9121093809620838, + "compression/movement_sparsity/model_sparsity": 0.880775617749376, + "compression_loss": 104.5011978149414, + "distillation_loss": 5.814626693725586, + "epoch": 4.27, + "learning_rate": 3.1825866441251057e-05, + "loss": 109.4795, + "step": 5053, + "task_loss": 3.0940518379211426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9856578745867526, + "compression/movement_sparsity/importance_threshold": -0.00010044844314370267, + "compression/movement_sparsity/linear_layer_sparsity": 0.912155122069135, + "compression/movement_sparsity/model_sparsity": 0.8808197875086838, + "compression_loss": 104.50589752197266, + "distillation_loss": 4.1859049797058105, + "epoch": 4.27, + "learning_rate": 3.182117028270874e-05, + "loss": 108.9171, + "step": 5054, + "task_loss": 1.856653094291687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9857076735887755, + "compression/movement_sparsity/importance_threshold": -0.00010009966413925334, + "compression/movement_sparsity/linear_layer_sparsity": 0.9121741411165142, + "compression/movement_sparsity/model_sparsity": 0.8808381531932761, + "compression_loss": 104.51061248779297, + "distillation_loss": 4.932311534881592, + "epoch": 4.27, + "learning_rate": 3.1816474124166436e-05, + "loss": 109.2564, + "step": 5055, + "task_loss": 2.129056930541992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9857573571818071, + "compression/movement_sparsity/importance_threshold": -9.975169342877569e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9121804132286907, + "compression/movement_sparsity/model_sparsity": 0.880844209839104, + "compression_loss": 104.51529693603516, + "distillation_loss": 4.652114391326904, + "epoch": 4.27, + "learning_rate": 3.181177796562412e-05, + "loss": 108.4885, + "step": 5056, + "task_loss": 2.521817207336426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9858069254997327, + "compression/movement_sparsity/importance_threshold": -9.94045300745719e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.912164911810764, + "compression/movement_sparsity/model_sparsity": 0.8808292409425712, + "compression_loss": 104.51992797851562, + "distillation_loss": 4.611353874206543, + "epoch": 4.27, + "learning_rate": 3.180708180708181e-05, + "loss": 108.9046, + "step": 5057, + "task_loss": 2.739943742752075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9858563786764374, + "compression/movement_sparsity/importance_threshold": -9.905817313894585e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9122902944334551, + "compression/movement_sparsity/model_sparsity": 0.8809503162864507, + "compression_loss": 104.52462768554688, + "distillation_loss": 3.9913489818573, + "epoch": 4.28, + "learning_rate": 3.1802385648539495e-05, + "loss": 108.7195, + "step": 5058, + "task_loss": 1.3560429811477661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9859057168458063, + "compression/movement_sparsity/importance_threshold": -9.871262168420231e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9123255303488191, + "compression/movement_sparsity/model_sparsity": 0.8809843417397236, + "compression_loss": 104.52928161621094, + "distillation_loss": 4.735760688781738, + "epoch": 4.28, + "learning_rate": 3.179768948999718e-05, + "loss": 108.7523, + "step": 5059, + "task_loss": 2.7767319679260254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9859549401417245, + "compression/movement_sparsity/importance_threshold": -9.836787477264519e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9123383488290276, + "compression/movement_sparsity/model_sparsity": 0.8809967198657027, + "compression_loss": 104.53387451171875, + "distillation_loss": 3.8069205284118652, + "epoch": 4.28, + "learning_rate": 3.1792993331454874e-05, + "loss": 108.7702, + "step": 5060, + "task_loss": 2.062389850616455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9860040486980772, + "compression/movement_sparsity/importance_threshold": -9.802393146657751e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9123520377734736, + "compression/movement_sparsity/model_sparsity": 0.8810099385527949, + "compression_loss": 104.53852844238281, + "distillation_loss": 4.421704292297363, + "epoch": 4.28, + "learning_rate": 3.1788297172912554e-05, + "loss": 108.9536, + "step": 5061, + "task_loss": 2.2816669940948486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9860530426487495, + "compression/movement_sparsity/importance_threshold": -9.768079082830491e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9123355466496332, + "compression/movement_sparsity/model_sparsity": 0.880994013949791, + "compression_loss": 104.54315185546875, + "distillation_loss": 4.5552802085876465, + "epoch": 4.28, + "learning_rate": 3.178360101437025e-05, + "loss": 109.0314, + "step": 5062, + "task_loss": 2.2028534412384033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9861019221276266, + "compression/movement_sparsity/importance_threshold": -9.733845192012956e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9124995039546263, + "compression/movement_sparsity/model_sparsity": 0.881152338816966, + "compression_loss": 104.5477294921875, + "distillation_loss": 2.74337100982666, + "epoch": 4.28, + "learning_rate": 3.1778904855827933e-05, + "loss": 108.8639, + "step": 5063, + "task_loss": 1.8484052419662476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9861506872685937, + "compression/movement_sparsity/importance_threshold": -9.69969138043562e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9125077435544627, + "compression/movement_sparsity/model_sparsity": 0.8811602953612, + "compression_loss": 104.55232238769531, + "distillation_loss": 5.002686977386475, + "epoch": 4.28, + "learning_rate": 3.1774208697285627e-05, + "loss": 109.1945, + "step": 5064, + "task_loss": 2.819601058959961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9861993382055357, + "compression/movement_sparsity/importance_threshold": -9.665617554328963e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9125325338989777, + "compression/movement_sparsity/model_sparsity": 0.8811842340811168, + "compression_loss": 104.55687713623047, + "distillation_loss": 5.790381908416748, + "epoch": 4.28, + "learning_rate": 3.1769512538743306e-05, + "loss": 108.9055, + "step": 5065, + "task_loss": 2.860867500305176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9862478750723379, + "compression/movement_sparsity/importance_threshold": -9.631623619923285e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9126209277536623, + "compression/movement_sparsity/model_sparsity": 0.8812695913349617, + "compression_loss": 104.56140899658203, + "distillation_loss": 4.94313907623291, + "epoch": 4.28, + "learning_rate": 3.176481638020099e-05, + "loss": 108.8471, + "step": 5066, + "task_loss": 3.8895223140716553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9862962980028855, + "compression/movement_sparsity/importance_threshold": -9.597709483448978e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9126741572379888, + "compression/movement_sparsity/model_sparsity": 0.8813209922227486, + "compression_loss": 104.56596374511719, + "distillation_loss": 4.835390090942383, + "epoch": 4.28, + "learning_rate": 3.1760120221658686e-05, + "loss": 108.7852, + "step": 5067, + "task_loss": 2.5026934146881104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9863446071310636, + "compression/movement_sparsity/importance_threshold": -9.563875051136518e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9127214961835032, + "compression/movement_sparsity/model_sparsity": 0.8813667049298529, + "compression_loss": 104.57048034667969, + "distillation_loss": 3.897519826889038, + "epoch": 4.28, + "learning_rate": 3.175542406311637e-05, + "loss": 108.6235, + "step": 5068, + "task_loss": 2.6005992889404297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9863928025907573, + "compression/movement_sparsity/importance_threshold": -9.530120229216208e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.912723272884481, + "compression/movement_sparsity/model_sparsity": 0.8813684205956863, + "compression_loss": 104.57502746582031, + "distillation_loss": 4.4903669357299805, + "epoch": 4.28, + "learning_rate": 3.1750727904574065e-05, + "loss": 108.8103, + "step": 5069, + "task_loss": 2.770446300506592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9864408845158518, + "compression/movement_sparsity/importance_threshold": -9.496444923918525e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9127726270143258, + "compression/movement_sparsity/model_sparsity": 0.88141607925934, + "compression_loss": 104.57952117919922, + "distillation_loss": 4.977445125579834, + "epoch": 4.29, + "learning_rate": 3.1746031746031745e-05, + "loss": 108.9538, + "step": 5070, + "task_loss": 3.13754940032959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.986488853040232, + "compression/movement_sparsity/importance_threshold": -9.462849041473858e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9128232093334371, + "compression/movement_sparsity/model_sparsity": 0.8814649239201804, + "compression_loss": 104.583984375, + "distillation_loss": 4.58910608291626, + "epoch": 4.29, + "learning_rate": 3.174133558748944e-05, + "loss": 108.6387, + "step": 5071, + "task_loss": 2.6777477264404297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9865367082977834, + "compression/movement_sparsity/importance_threshold": -9.429332488112598e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9128593872580443, + "compression/movement_sparsity/model_sparsity": 0.881499859021781, + "compression_loss": 104.58843994140625, + "distillation_loss": 7.378966331481934, + "epoch": 4.29, + "learning_rate": 3.1736639428947124e-05, + "loss": 109.2171, + "step": 5072, + "task_loss": 3.9182486534118652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.986584450422391, + "compression/movement_sparsity/importance_threshold": -9.395895170065047e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9128690339096618, + "compression/movement_sparsity/model_sparsity": 0.8815091742812388, + "compression_loss": 104.59294891357422, + "distillation_loss": 4.1890869140625, + "epoch": 4.29, + "learning_rate": 3.173194327040481e-05, + "loss": 108.7325, + "step": 5073, + "task_loss": 2.506502628326416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9866320795479399, + "compression/movement_sparsity/importance_threshold": -9.362536993561682e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9129192942762469, + "compression/movement_sparsity/model_sparsity": 0.8815577080496128, + "compression_loss": 104.597412109375, + "distillation_loss": 5.296530723571777, + "epoch": 4.29, + "learning_rate": 3.17272471118625e-05, + "loss": 108.9708, + "step": 5074, + "task_loss": 4.015791893005371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9866795958083151, + "compression/movement_sparsity/importance_threshold": -9.32925786483298e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9129804414078836, + "compression/movement_sparsity/model_sparsity": 0.8816167545891672, + "compression_loss": 104.60186004638672, + "distillation_loss": 4.187299728393555, + "epoch": 4.29, + "learning_rate": 3.172255095332018e-05, + "loss": 109.1803, + "step": 5075, + "task_loss": 2.220911979675293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.986726999337402, + "compression/movement_sparsity/importance_threshold": -9.296057690109244e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9129801790761957, + "compression/movement_sparsity/model_sparsity": 0.8816165012693797, + "compression_loss": 104.60629272460938, + "distillation_loss": 4.564126491546631, + "epoch": 4.29, + "learning_rate": 3.1717854794777876e-05, + "loss": 108.3006, + "step": 5076, + "task_loss": 1.6244584321975708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9867742902690857, + "compression/movement_sparsity/importance_threshold": -9.262936375620864e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9130673566657814, + "compression/movement_sparsity/model_sparsity": 0.8817006840405736, + "compression_loss": 104.6106948852539, + "distillation_loss": 4.110866069793701, + "epoch": 4.29, + "learning_rate": 3.171315863623556e-05, + "loss": 108.7214, + "step": 5077, + "task_loss": 1.8795019388198853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9868214687372513, + "compression/movement_sparsity/importance_threshold": -9.229893827598229e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9130596298051534, + "compression/movement_sparsity/model_sparsity": 0.8816932226213787, + "compression_loss": 104.61506652832031, + "distillation_loss": 3.1517021656036377, + "epoch": 4.29, + "learning_rate": 3.170846247769325e-05, + "loss": 108.7865, + "step": 5078, + "task_loss": 1.1313797235488892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9868685348757839, + "compression/movement_sparsity/importance_threshold": -9.196929952271816e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9131114880102017, + "compression/movement_sparsity/model_sparsity": 0.8817432993375491, + "compression_loss": 104.61946868896484, + "distillation_loss": 4.865270614624023, + "epoch": 4.29, + "learning_rate": 3.1703766319150935e-05, + "loss": 109.3787, + "step": 5079, + "task_loss": 2.8251864910125732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9869154888185686, + "compression/movement_sparsity/importance_threshold": -9.16404465587193e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.913124282642075, + "compression/movement_sparsity/model_sparsity": 0.8817556544344567, + "compression_loss": 104.62379455566406, + "distillation_loss": 3.9382247924804688, + "epoch": 4.29, + "learning_rate": 3.169907016060862e-05, + "loss": 108.4242, + "step": 5080, + "task_loss": 1.8389393091201782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9869623306994906, + "compression/movement_sparsity/importance_threshold": -9.131237844629131e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9131426935569048, + "compression/movement_sparsity/model_sparsity": 0.8817734328777235, + "compression_loss": 104.62818908691406, + "distillation_loss": 3.9619786739349365, + "epoch": 4.29, + "learning_rate": 3.1694374002066315e-05, + "loss": 108.7531, + "step": 5081, + "task_loss": 2.127014398574829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9870090606524351, + "compression/movement_sparsity/importance_threshold": -9.098509424773551e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.913176319709638, + "compression/movement_sparsity/model_sparsity": 0.8818059038686641, + "compression_loss": 104.6324691772461, + "distillation_loss": 4.5122785568237305, + "epoch": 4.3, + "learning_rate": 3.1689677843524e-05, + "loss": 109.0548, + "step": 5082, + "task_loss": 2.526580333709717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9870556788112871, + "compression/movement_sparsity/importance_threshold": -9.065859302535839e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9131126327302947, + "compression/movement_sparsity/model_sparsity": 0.8817444047329854, + "compression_loss": 104.63672637939453, + "distillation_loss": 4.357398986816406, + "epoch": 4.3, + "learning_rate": 3.168498168498169e-05, + "loss": 109.243, + "step": 5083, + "task_loss": 2.2652125358581543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9871021853099319, + "compression/movement_sparsity/importance_threshold": -9.033287384146299e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9131174143215167, + "compression/movement_sparsity/model_sparsity": 0.881749022061839, + "compression_loss": 104.64106750488281, + "distillation_loss": 4.146395683288574, + "epoch": 4.3, + "learning_rate": 3.1680285526439374e-05, + "loss": 108.6691, + "step": 5084, + "task_loss": 1.9048006534576416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9871485802822545, + "compression/movement_sparsity/importance_threshold": -9.00079357583532e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9131353482696412, + "compression/movement_sparsity/model_sparsity": 0.881766339923674, + "compression_loss": 104.64531707763672, + "distillation_loss": 3.0356369018554688, + "epoch": 4.3, + "learning_rate": 3.167558936789706e-05, + "loss": 108.205, + "step": 5085, + "task_loss": 1.0099021196365356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9871948638621402, + "compression/movement_sparsity/importance_threshold": -8.968377783833206e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.913215156723628, + "compression/movement_sparsity/model_sparsity": 0.8818434067117469, + "compression_loss": 104.64957427978516, + "distillation_loss": 4.1330718994140625, + "epoch": 4.3, + "learning_rate": 3.167089320935475e-05, + "loss": 108.451, + "step": 5086, + "task_loss": 2.4595046043395996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9872410361834739, + "compression/movement_sparsity/importance_threshold": -8.936039914370607e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9132108044024408, + "compression/movement_sparsity/model_sparsity": 0.8818392039061819, + "compression_loss": 104.65386962890625, + "distillation_loss": 4.441989898681641, + "epoch": 4.3, + "learning_rate": 3.166619705081243e-05, + "loss": 108.7017, + "step": 5087, + "task_loss": 3.139507293701172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.987287097380141, + "compression/movement_sparsity/importance_threshold": -8.903779873677652e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9132372522062572, + "compression/movement_sparsity/model_sparsity": 0.8818647431465741, + "compression_loss": 104.65805053710938, + "distillation_loss": 5.655196189880371, + "epoch": 4.3, + "learning_rate": 3.1661500892270126e-05, + "loss": 109.1403, + "step": 5088, + "task_loss": 3.0189309120178223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9873330475860265, + "compression/movement_sparsity/importance_threshold": -8.871597567984904e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9132284283222066, + "compression/movement_sparsity/model_sparsity": 0.8818562223900862, + "compression_loss": 104.66229248046875, + "distillation_loss": 5.313323020935059, + "epoch": 4.3, + "learning_rate": 3.165680473372781e-05, + "loss": 108.5996, + "step": 5089, + "task_loss": 2.9535226821899414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9873788869350155, + "compression/movement_sparsity/importance_threshold": -8.839492903522754e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9132072271521501, + "compression/movement_sparsity/model_sparsity": 0.8818357495454435, + "compression_loss": 104.66650390625, + "distillation_loss": 7.070138931274414, + "epoch": 4.3, + "learning_rate": 3.16521085751855e-05, + "loss": 109.2462, + "step": 5090, + "task_loss": 4.008217811584473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9874246155609934, + "compression/movement_sparsity/importance_threshold": -8.807465786521418e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.913269185127186, + "compression/movement_sparsity/model_sparsity": 0.881895579073432, + "compression_loss": 104.67070770263672, + "distillation_loss": 4.5764055252075195, + "epoch": 4.3, + "learning_rate": 3.1647412416643185e-05, + "loss": 108.8554, + "step": 5091, + "task_loss": 2.3671629428863525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9874702335978449, + "compression/movement_sparsity/importance_threshold": -8.775516123211546e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9132949651776148, + "compression/movement_sparsity/model_sparsity": 0.8819204734998198, + "compression_loss": 104.67491912841797, + "distillation_loss": 3.8762803077697754, + "epoch": 4.3, + "learning_rate": 3.164271625810087e-05, + "loss": 108.9501, + "step": 5092, + "task_loss": 1.462957501411438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9875157411794555, + "compression/movement_sparsity/importance_threshold": -8.74364381982344e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9133516765188909, + "compression/movement_sparsity/model_sparsity": 0.8819752366320586, + "compression_loss": 104.67914581298828, + "distillation_loss": 3.62357234954834, + "epoch": 4.3, + "learning_rate": 3.1638020099558564e-05, + "loss": 108.8851, + "step": 5093, + "task_loss": 2.4960060119628906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9875611384397103, + "compression/movement_sparsity/importance_threshold": -8.711848782587406e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9134001363361631, + "compression/movement_sparsity/model_sparsity": 0.8820220317055276, + "compression_loss": 104.68325805664062, + "distillation_loss": 5.020461559295654, + "epoch": 4.31, + "learning_rate": 3.163332394101625e-05, + "loss": 108.5681, + "step": 5094, + "task_loss": 2.926961898803711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9876064255124943, + "compression/movement_sparsity/importance_threshold": -8.680130917733831e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9133786370619156, + "compression/movement_sparsity/model_sparsity": 0.88200127099749, + "compression_loss": 104.68740844726562, + "distillation_loss": 4.894475936889648, + "epoch": 4.31, + "learning_rate": 3.162862778247394e-05, + "loss": 108.452, + "step": 5095, + "task_loss": 2.347371816635132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9876516025316927, + "compression/movement_sparsity/importance_threshold": -8.648490131493367e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9134246405006547, + "compression/movement_sparsity/model_sparsity": 0.8820456940765854, + "compression_loss": 104.69146728515625, + "distillation_loss": 4.767604827880859, + "epoch": 4.31, + "learning_rate": 3.162393162393162e-05, + "loss": 108.6651, + "step": 5096, + "task_loss": 3.22982120513916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9876966696311907, + "compression/movement_sparsity/importance_threshold": -8.616926330096142e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9134568119049363, + "compression/movement_sparsity/model_sparsity": 0.882076760294159, + "compression_loss": 104.6956558227539, + "distillation_loss": 5.728018760681152, + "epoch": 4.31, + "learning_rate": 3.161923546538931e-05, + "loss": 109.4553, + "step": 5097, + "task_loss": 3.0409679412841797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9877416269448734, + "compression/movement_sparsity/importance_threshold": -8.585439419772634e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9135070961198568, + "compression/movement_sparsity/model_sparsity": 0.8821253170916047, + "compression_loss": 104.69977569580078, + "distillation_loss": 3.09999418258667, + "epoch": 4.31, + "learning_rate": 3.1614539306847e-05, + "loss": 109.3704, + "step": 5098, + "task_loss": 1.7193666696548462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9877864746066259, + "compression/movement_sparsity/importance_threshold": -8.554029306753318e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9135459331338468, + "compression/movement_sparsity/model_sparsity": 0.8821628199346875, + "compression_loss": 104.70387268066406, + "distillation_loss": 6.444982528686523, + "epoch": 4.31, + "learning_rate": 3.160984314830469e-05, + "loss": 109.25, + "step": 5099, + "task_loss": 4.070639610290527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9878312127503334, + "compression/movement_sparsity/importance_threshold": -8.522695897268586e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9136412430057602, + "compression/movement_sparsity/model_sparsity": 0.8822548556192933, + "compression_loss": 104.70794677734375, + "distillation_loss": 5.099937438964844, + "epoch": 4.31, + "learning_rate": 3.1605146989762375e-05, + "loss": 108.571, + "step": 5100, + "task_loss": 2.496345043182373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.987875841509881, + "compression/movement_sparsity/importance_threshold": -8.49143909754874e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9137103077847072, + "compression/movement_sparsity/model_sparsity": 0.8823215478106152, + "compression_loss": 104.7120590209961, + "distillation_loss": 5.626680850982666, + "epoch": 4.31, + "learning_rate": 3.160045083122006e-05, + "loss": 109.3524, + "step": 5101, + "task_loss": 3.508288621902466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9879203610191538, + "compression/movement_sparsity/importance_threshold": -8.46025881382417e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9136771466745118, + "compression/movement_sparsity/model_sparsity": 0.8822895258865706, + "compression_loss": 104.7160415649414, + "distillation_loss": 3.738895893096924, + "epoch": 4.31, + "learning_rate": 3.1595754672677755e-05, + "loss": 109.3137, + "step": 5102, + "task_loss": 2.1533472537994385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9879647714120371, + "compression/movement_sparsity/importance_threshold": -8.429154952325352e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9136926719407737, + "compression/movement_sparsity/model_sparsity": 0.8823045178121751, + "compression_loss": 104.72012329101562, + "distillation_loss": 3.4749934673309326, + "epoch": 4.31, + "learning_rate": 3.159105851413544e-05, + "loss": 109.2707, + "step": 5103, + "task_loss": 3.074566602706909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9880090728224158, + "compression/movement_sparsity/importance_threshold": -8.398127419282676e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.913720562568874, + "compression/movement_sparsity/model_sparsity": 0.8823314503113985, + "compression_loss": 104.72415924072266, + "distillation_loss": 4.365281581878662, + "epoch": 4.31, + "learning_rate": 3.158636235559313e-05, + "loss": 109.637, + "step": 5104, + "task_loss": 2.7593789100646973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9880532653841753, + "compression/movement_sparsity/importance_threshold": -8.367176120926533e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9137589941611644, + "compression/movement_sparsity/model_sparsity": 0.8823685616602643, + "compression_loss": 104.72821807861328, + "distillation_loss": 3.7468056678771973, + "epoch": 4.32, + "learning_rate": 3.1581666197050814e-05, + "loss": 108.353, + "step": 5105, + "task_loss": 3.0007364749908447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9880973492312005, + "compression/movement_sparsity/importance_threshold": -8.336300963487312e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9137280867186522, + "compression/movement_sparsity/model_sparsity": 0.8823387159834849, + "compression_loss": 104.73224639892578, + "distillation_loss": 4.145212173461914, + "epoch": 4.32, + "learning_rate": 3.15769700385085e-05, + "loss": 108.5379, + "step": 5106, + "task_loss": 2.364569664001465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9881413244973768, + "compression/movement_sparsity/importance_threshold": -8.305501853195316e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9137238655633091, + "compression/movement_sparsity/model_sparsity": 0.8823346398378136, + "compression_loss": 104.7362289428711, + "distillation_loss": 3.8106284141540527, + "epoch": 4.32, + "learning_rate": 3.1572273879966193e-05, + "loss": 108.9211, + "step": 5107, + "task_loss": 1.8723922967910767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9881851913165891, + "compression/movement_sparsity/importance_threshold": -8.274778696281109e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9137836533398354, + "compression/movement_sparsity/model_sparsity": 0.8823923737202874, + "compression_loss": 104.74028015136719, + "distillation_loss": 5.631810665130615, + "epoch": 4.32, + "learning_rate": 3.156757772142387e-05, + "loss": 108.7576, + "step": 5108, + "task_loss": 2.161163568496704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9882289498227227, + "compression/movement_sparsity/importance_threshold": -8.24413139897508e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9138630682962902, + "compression/movement_sparsity/model_sparsity": 0.8824690605286791, + "compression_loss": 104.74427032470703, + "distillation_loss": 6.678553581237793, + "epoch": 4.32, + "learning_rate": 3.1562881562881566e-05, + "loss": 109.5546, + "step": 5109, + "task_loss": 3.2221322059631348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9882726001496626, + "compression/movement_sparsity/importance_threshold": -8.213559867507533e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9138481988592483, + "compression/movement_sparsity/model_sparsity": 0.8824547019025433, + "compression_loss": 104.74825286865234, + "distillation_loss": 4.360442161560059, + "epoch": 4.32, + "learning_rate": 3.155818540433925e-05, + "loss": 108.8587, + "step": 5110, + "task_loss": 3.30605149269104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9883161424312941, + "compression/movement_sparsity/importance_threshold": -8.183064008108771e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139562079696939, + "compression/movement_sparsity/model_sparsity": 0.8825590005677703, + "compression_loss": 104.75228118896484, + "distillation_loss": 4.932836532592773, + "epoch": 4.32, + "learning_rate": 3.155348924579694e-05, + "loss": 108.5993, + "step": 5111, + "task_loss": 2.753314971923828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9883595768015022, + "compression/movement_sparsity/importance_threshold": -8.152643727009443e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139614307551184, + "compression/movement_sparsity/model_sparsity": 0.8825640439344482, + "compression_loss": 104.75624084472656, + "distillation_loss": 5.148598670959473, + "epoch": 4.32, + "learning_rate": 3.1548793087254625e-05, + "loss": 108.8279, + "step": 5112, + "task_loss": 2.9694583415985107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9884029033941721, + "compression/movement_sparsity/importance_threshold": -8.122298930439767e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139584735615447, + "compression/movement_sparsity/model_sparsity": 0.8825611883295712, + "compression_loss": 104.76024627685547, + "distillation_loss": 3.138554573059082, + "epoch": 4.32, + "learning_rate": 3.154409692871231e-05, + "loss": 108.4484, + "step": 5113, + "task_loss": 1.0104743242263794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.988446122343189, + "compression/movement_sparsity/importance_threshold": -8.092029524630218e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9138976722307695, + "compression/movement_sparsity/model_sparsity": 0.8825024757115548, + "compression_loss": 104.76422882080078, + "distillation_loss": 3.8025588989257812, + "epoch": 4.32, + "learning_rate": 3.1539400770170005e-05, + "loss": 109.316, + "step": 5114, + "task_loss": 1.6544033288955688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9884892337824379, + "compression/movement_sparsity/importance_threshold": -8.061835415811187e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139661765738375, + "compression/movement_sparsity/model_sparsity": 0.8825686267196945, + "compression_loss": 104.7681884765625, + "distillation_loss": 4.587072372436523, + "epoch": 4.32, + "learning_rate": 3.153470461162769e-05, + "loss": 109.293, + "step": 5115, + "task_loss": 2.6435980796813965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9885322378458041, + "compression/movement_sparsity/importance_threshold": -8.031716510212977e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9140330115334365, + "compression/movement_sparsity/model_sparsity": 0.8826331656928229, + "compression_loss": 104.77214813232422, + "distillation_loss": 5.269341468811035, + "epoch": 4.32, + "learning_rate": 3.153000845308538e-05, + "loss": 110.1792, + "step": 5116, + "task_loss": 3.031059741973877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9885751346671726, + "compression/movement_sparsity/importance_threshold": -8.001672714066151e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139748335195411, + "compression/movement_sparsity/model_sparsity": 0.8825769862726813, + "compression_loss": 104.77610778808594, + "distillation_loss": 4.452831745147705, + "epoch": 4.33, + "learning_rate": 3.1525312294543064e-05, + "loss": 109.2125, + "step": 5117, + "task_loss": 2.4274466037750244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9886179243804287, + "compression/movement_sparsity/importance_threshold": -7.971703933600926e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9140202169015632, + "compression/movement_sparsity/model_sparsity": 0.8826208105959154, + "compression_loss": 104.78009796142578, + "distillation_loss": 3.2454230785369873, + "epoch": 4.33, + "learning_rate": 3.152061613600075e-05, + "loss": 108.4691, + "step": 5118, + "task_loss": 1.296035885810852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9886606071194574, + "compression/movement_sparsity/importance_threshold": -7.941810075047951e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139668920238957, + "compression/movement_sparsity/model_sparsity": 0.8825693175918422, + "compression_loss": 104.7839584350586, + "distillation_loss": 2.4104764461517334, + "epoch": 4.33, + "learning_rate": 3.151591997745844e-05, + "loss": 108.2582, + "step": 5119, + "task_loss": 1.2541050910949707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9887031830181439, + "compression/movement_sparsity/importance_threshold": -7.911991044637356e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139534296386348, + "compression/movement_sparsity/model_sparsity": 0.8825563176809301, + "compression_loss": 104.78782653808594, + "distillation_loss": 3.8216357231140137, + "epoch": 4.33, + "learning_rate": 3.151122381891613e-05, + "loss": 108.2746, + "step": 5120, + "task_loss": 2.261786699295044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9887456522103732, + "compression/movement_sparsity/importance_threshold": -7.882246748599619e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139611922717658, + "compression/movement_sparsity/model_sparsity": 0.8825638136437324, + "compression_loss": 104.79173278808594, + "distillation_loss": 3.2182445526123047, + "epoch": 4.33, + "learning_rate": 3.1506527660373816e-05, + "loss": 108.3626, + "step": 5121, + "task_loss": 1.618814468383789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9887880148300308, + "compression/movement_sparsity/importance_threshold": -7.852577093165215e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139735099369336, + "compression/movement_sparsity/model_sparsity": 0.8825757081592082, + "compression_loss": 104.79562377929688, + "distillation_loss": 4.392571926116943, + "epoch": 4.33, + "learning_rate": 3.15018315018315e-05, + "loss": 109.0453, + "step": 5122, + "task_loss": 3.160836935043335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9888302710110014, + "compression/movement_sparsity/importance_threshold": -7.822981984564534e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139826557735102, + "compression/movement_sparsity/model_sparsity": 0.8825845398081625, + "compression_loss": 104.79943084716797, + "distillation_loss": 3.1500296592712402, + "epoch": 4.33, + "learning_rate": 3.149713534328919e-05, + "loss": 108.8964, + "step": 5123, + "task_loss": 2.643526077270508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9888724208871704, + "compression/movement_sparsity/importance_threshold": -7.79346132902788e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9140068618338111, + "compression/movement_sparsity/model_sparsity": 0.8826079143158254, + "compression_loss": 104.80328369140625, + "distillation_loss": 3.7356560230255127, + "epoch": 4.33, + "learning_rate": 3.149243918474688e-05, + "loss": 109.3757, + "step": 5124, + "task_loss": 2.597242832183838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9889144645924229, + "compression/movement_sparsity/importance_threshold": -7.764015032785728e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139743327045005, + "compression/movement_sparsity/model_sparsity": 0.8825765026621779, + "compression_loss": 104.80711364746094, + "distillation_loss": 3.989664077758789, + "epoch": 4.33, + "learning_rate": 3.148774302620456e-05, + "loss": 110.1292, + "step": 5125, + "task_loss": 1.669627070426941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9889564022606441, + "compression/movement_sparsity/importance_threshold": -7.734643002068383e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139296528483688, + "compression/movement_sparsity/model_sparsity": 0.8825333576965558, + "compression_loss": 104.8108901977539, + "distillation_loss": 4.546747207641602, + "epoch": 4.33, + "learning_rate": 3.1483046867662254e-05, + "loss": 110.0306, + "step": 5126, + "task_loss": 2.9052999019622803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.988998234025719, + "compression/movement_sparsity/importance_threshold": -7.705345143106408e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139597136749789, + "compression/movement_sparsity/model_sparsity": 0.8825623858412939, + "compression_loss": 104.81471252441406, + "distillation_loss": 3.6730563640594482, + "epoch": 4.33, + "learning_rate": 3.147835070911994e-05, + "loss": 110.2601, + "step": 5127, + "task_loss": 2.1810712814331055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9890399600215328, + "compression/movement_sparsity/importance_threshold": -7.676121362130018e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139601071725109, + "compression/movement_sparsity/model_sparsity": 0.882562765820975, + "compression_loss": 104.8185043334961, + "distillation_loss": 5.001622200012207, + "epoch": 4.33, + "learning_rate": 3.1473654550577634e-05, + "loss": 109.085, + "step": 5128, + "task_loss": 2.9810686111450195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9890815803819707, + "compression/movement_sparsity/importance_threshold": -7.646971565369778e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9139924574393069, + "compression/movement_sparsity/model_sparsity": 0.8825940047565857, + "compression_loss": 104.82232666015625, + "distillation_loss": 5.106436729431152, + "epoch": 4.34, + "learning_rate": 3.146895839203532e-05, + "loss": 108.606, + "step": 5129, + "task_loss": 3.0757975578308105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9891230952409178, + "compression/movement_sparsity/importance_threshold": -7.61789565905599e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9140427774267302, + "compression/movement_sparsity/model_sparsity": 0.8826425960976386, + "compression_loss": 104.82608032226562, + "distillation_loss": 3.6434803009033203, + "epoch": 4.34, + "learning_rate": 3.1464262233493e-05, + "loss": 108.4439, + "step": 5130, + "task_loss": 1.9672739505767822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9891645047322591, + "compression/movement_sparsity/importance_threshold": -7.588893549419044e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9140798138914072, + "compression/movement_sparsity/model_sparsity": 0.8826783602458165, + "compression_loss": 104.82988739013672, + "distillation_loss": 3.821112632751465, + "epoch": 4.34, + "learning_rate": 3.145956607495069e-05, + "loss": 108.6267, + "step": 5131, + "task_loss": 1.8236448764801025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9892058089898801, + "compression/movement_sparsity/importance_threshold": -7.559965142689331e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9141396016679335, + "compression/movement_sparsity/model_sparsity": 0.8827360941282902, + "compression_loss": 104.83360290527344, + "distillation_loss": 6.156956672668457, + "epoch": 4.34, + "learning_rate": 3.145486991640838e-05, + "loss": 110.2244, + "step": 5132, + "task_loss": 3.2181928157806396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9892470081476655, + "compression/movement_sparsity/importance_threshold": -7.531110345097413e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9141438108991089, + "compression/movement_sparsity/model_sparsity": 0.8827401587594258, + "compression_loss": 104.83731079101562, + "distillation_loss": 4.476194381713867, + "epoch": 4.34, + "learning_rate": 3.145017375786607e-05, + "loss": 108.8248, + "step": 5133, + "task_loss": 2.3508927822113037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9892881023395008, + "compression/movement_sparsity/importance_threshold": -7.50232906287342e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9140853228568551, + "compression/movement_sparsity/model_sparsity": 0.8826836799613536, + "compression_loss": 104.84107208251953, + "distillation_loss": 4.446636199951172, + "epoch": 4.34, + "learning_rate": 3.144547759932375e-05, + "loss": 109.9789, + "step": 5134, + "task_loss": 1.7110819816589355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.989329091699271, + "compression/movement_sparsity/importance_threshold": -7.473621202247916e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9140806008864713, + "compression/movement_sparsity/model_sparsity": 0.8826791202051789, + "compression_loss": 104.84476470947266, + "distillation_loss": 2.5562925338745117, + "epoch": 4.34, + "learning_rate": 3.1440781440781445e-05, + "loss": 108.2313, + "step": 5135, + "task_loss": 1.7336677312850952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9893699763608611, + "compression/movement_sparsity/importance_threshold": -7.44498666945129e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.914067639316251, + "compression/movement_sparsity/model_sparsity": 0.8826666039047703, + "compression_loss": 104.84848022460938, + "distillation_loss": 4.606205463409424, + "epoch": 4.34, + "learning_rate": 3.143608528223913e-05, + "loss": 108.8274, + "step": 5136, + "task_loss": 2.0235323905944824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9894107564581565, + "compression/movement_sparsity/importance_threshold": -7.416425370713933e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9141178519861657, + "compression/movement_sparsity/model_sparsity": 0.8827150916150011, + "compression_loss": 104.85218048095703, + "distillation_loss": 5.713028907775879, + "epoch": 4.34, + "learning_rate": 3.143138912369682e-05, + "loss": 108.8177, + "step": 5137, + "task_loss": 2.088653802871704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9894514321250422, + "compression/movement_sparsity/importance_threshold": -7.387937212266146e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.914157988734428, + "compression/movement_sparsity/model_sparsity": 0.8827538495424855, + "compression_loss": 104.85587310791016, + "distillation_loss": 4.470807075500488, + "epoch": 4.34, + "learning_rate": 3.1426692965154504e-05, + "loss": 109.1675, + "step": 5138, + "task_loss": 2.808851957321167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9894920034954032, + "compression/movement_sparsity/importance_threshold": -7.359522100338581e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9141779378668828, + "compression/movement_sparsity/model_sparsity": 0.8827731133608697, + "compression_loss": 104.8595962524414, + "distillation_loss": 3.6586737632751465, + "epoch": 4.34, + "learning_rate": 3.142199680661219e-05, + "loss": 108.6665, + "step": 5139, + "task_loss": 2.053719997406006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.989532470703125, + "compression/movement_sparsity/importance_threshold": -7.331179941161281e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9142161905966586, + "compression/movement_sparsity/model_sparsity": 0.8828100519916986, + "compression_loss": 104.86319732666016, + "distillation_loss": 3.2243106365203857, + "epoch": 4.34, + "learning_rate": 3.141730064806988e-05, + "loss": 108.8292, + "step": 5140, + "task_loss": 1.8831080198287964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9895728338820925, + "compression/movement_sparsity/importance_threshold": -7.302910640964895e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9142782916617063, + "compression/movement_sparsity/model_sparsity": 0.8828700196941166, + "compression_loss": 104.86686706542969, + "distillation_loss": 3.329256534576416, + "epoch": 4.35, + "learning_rate": 3.141260448952757e-05, + "loss": 108.6191, + "step": 5141, + "task_loss": 2.3088643550872803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9896130931661908, + "compression/movement_sparsity/importance_threshold": -7.274714105979727e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.91426172899286, + "compression/movement_sparsity/model_sparsity": 0.882854026003898, + "compression_loss": 104.87046813964844, + "distillation_loss": 4.86141300201416, + "epoch": 4.35, + "learning_rate": 3.1407908330985256e-05, + "loss": 109.3277, + "step": 5142, + "task_loss": 2.388437032699585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9896532486893052, + "compression/movement_sparsity/importance_threshold": -7.246590242436252e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9142341364689506, + "compression/movement_sparsity/model_sparsity": 0.8828273813680694, + "compression_loss": 104.87413787841797, + "distillation_loss": 4.613224029541016, + "epoch": 4.35, + "learning_rate": 3.140321217244294e-05, + "loss": 108.9939, + "step": 5143, + "task_loss": 3.561182737350464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9896933005853207, + "compression/movement_sparsity/importance_threshold": -7.218538956564862e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9142767295957459, + "compression/movement_sparsity/model_sparsity": 0.8828685112899275, + "compression_loss": 104.877685546875, + "distillation_loss": 4.259998321533203, + "epoch": 4.35, + "learning_rate": 3.139851601390063e-05, + "loss": 108.591, + "step": 5144, + "task_loss": 2.5158400535583496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9897332489881225, + "compression/movement_sparsity/importance_threshold": -7.190560154595772e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9143181541541129, + "compression/movement_sparsity/model_sparsity": 0.8829085127872778, + "compression_loss": 104.88125610351562, + "distillation_loss": 4.373218536376953, + "epoch": 4.35, + "learning_rate": 3.139381985535832e-05, + "loss": 108.7652, + "step": 5145, + "task_loss": 2.8389768600463867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9897730940315957, + "compression/movement_sparsity/importance_threshold": -7.162653742759633e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9143566095947385, + "compression/movement_sparsity/model_sparsity": 0.8829456471652152, + "compression_loss": 104.88475799560547, + "distillation_loss": 5.307328224182129, + "epoch": 4.35, + "learning_rate": 3.138912369681601e-05, + "loss": 109.1314, + "step": 5146, + "task_loss": 2.4835269451141357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9898128358496256, + "compression/movement_sparsity/importance_threshold": -7.134819627286661e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.914423921521043, + "compression/movement_sparsity/model_sparsity": 0.8830106467197754, + "compression_loss": 104.8883056640625, + "distillation_loss": 3.563798427581787, + "epoch": 4.35, + "learning_rate": 3.1384427538273694e-05, + "loss": 108.8791, + "step": 5147, + "task_loss": 1.530679702758789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9898524745760973, + "compression/movement_sparsity/importance_threshold": -7.107057714407332e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9144690544955447, + "compression/movement_sparsity/model_sparsity": 0.8830542292377577, + "compression_loss": 104.89183044433594, + "distillation_loss": 2.8608226776123047, + "epoch": 4.35, + "learning_rate": 3.137973137973138e-05, + "loss": 108.3913, + "step": 5148, + "task_loss": 1.624121069908142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9898920103448957, + "compression/movement_sparsity/importance_threshold": -7.079367910352036e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9144665861928442, + "compression/movement_sparsity/model_sparsity": 0.8830518457288482, + "compression_loss": 104.89539337158203, + "distillation_loss": 3.357325792312622, + "epoch": 4.35, + "learning_rate": 3.137503522118907e-05, + "loss": 108.7136, + "step": 5149, + "task_loss": 1.8940143585205078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9899314432899061, + "compression/movement_sparsity/importance_threshold": -7.051750121351164e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9145318829348181, + "compression/movement_sparsity/model_sparsity": 0.883114899326859, + "compression_loss": 104.89884948730469, + "distillation_loss": 3.313788890838623, + "epoch": 4.35, + "learning_rate": 3.137033906264676e-05, + "loss": 108.6425, + "step": 5150, + "task_loss": 1.9851895570755005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9899707735450137, + "compression/movement_sparsity/importance_threshold": -7.024204253635105e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9145992664061283, + "compression/movement_sparsity/model_sparsity": 0.8831799679686341, + "compression_loss": 104.9023666381836, + "distillation_loss": 4.493631362915039, + "epoch": 4.35, + "learning_rate": 3.136564290410444e-05, + "loss": 109.3503, + "step": 5151, + "task_loss": 2.578348159790039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9900100012441035, + "compression/movement_sparsity/importance_threshold": -6.996730213434336e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9146714195444929, + "compression/movement_sparsity/model_sparsity": 0.8832496424247268, + "compression_loss": 104.90584564208984, + "distillation_loss": 3.491774797439575, + "epoch": 4.35, + "learning_rate": 3.136094674556213e-05, + "loss": 109.0113, + "step": 5152, + "task_loss": 2.686933755874634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9900491265210609, + "compression/movement_sparsity/importance_threshold": -6.969327906979073e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9147356431113798, + "compression/movement_sparsity/model_sparsity": 0.8833116597145162, + "compression_loss": 104.90933227539062, + "distillation_loss": 4.047588348388672, + "epoch": 4.36, + "learning_rate": 3.135625058701982e-05, + "loss": 108.8499, + "step": 5153, + "task_loss": 2.6929171085357666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9900881495097708, + "compression/movement_sparsity/importance_threshold": -6.941997240499879e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9148346494752603, + "compression/movement_sparsity/model_sparsity": 0.8834072649052183, + "compression_loss": 104.91276550292969, + "distillation_loss": 3.424588918685913, + "epoch": 4.36, + "learning_rate": 3.1351554428477506e-05, + "loss": 108.1409, + "step": 5154, + "task_loss": 2.34260892868042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9901270703441183, + "compression/movement_sparsity/importance_threshold": -6.914738120227144e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9148297247940267, + "compression/movement_sparsity/model_sparsity": 0.8834025094019351, + "compression_loss": 104.91621398925781, + "distillation_loss": 3.7153120040893555, + "epoch": 4.36, + "learning_rate": 3.134685826993519e-05, + "loss": 109.1966, + "step": 5155, + "task_loss": 2.24242901802063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9901658891579888, + "compression/movement_sparsity/importance_threshold": -6.887550452391172e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9148127805518161, + "compression/movement_sparsity/model_sparsity": 0.883386147246571, + "compression_loss": 104.91961669921875, + "distillation_loss": 3.4880757331848145, + "epoch": 4.36, + "learning_rate": 3.134216211139288e-05, + "loss": 109.0609, + "step": 5156, + "task_loss": 1.8820438385009766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9902046060852672, + "compression/movement_sparsity/importance_threshold": -6.86043414322244e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9147581797882115, + "compression/movement_sparsity/model_sparsity": 0.8833334221871679, + "compression_loss": 104.92298889160156, + "distillation_loss": 3.808260440826416, + "epoch": 4.36, + "learning_rate": 3.133746595285057e-05, + "loss": 109.3731, + "step": 5157, + "task_loss": 2.200307846069336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9902432212598387, + "compression/movement_sparsity/importance_threshold": -6.833389098951335e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9147513710884915, + "compression/movement_sparsity/model_sparsity": 0.8833268473872292, + "compression_loss": 104.92642974853516, + "distillation_loss": 3.0494039058685303, + "epoch": 4.36, + "learning_rate": 3.133276979430826e-05, + "loss": 108.7687, + "step": 5158, + "task_loss": 1.3738709688186646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9902817348155885, + "compression/movement_sparsity/importance_threshold": -6.80641522580825e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9147351899930096, + "compression/movement_sparsity/model_sparsity": 0.883311222162156, + "compression_loss": 104.92986297607422, + "distillation_loss": 3.6871981620788574, + "epoch": 4.36, + "learning_rate": 3.132807363576595e-05, + "loss": 109.1436, + "step": 5159, + "task_loss": 2.0332021713256836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9903201468864018, + "compression/movement_sparsity/importance_threshold": -6.779512430023487e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9148335524518378, + "compression/movement_sparsity/model_sparsity": 0.8834062055679252, + "compression_loss": 104.93321990966797, + "distillation_loss": 3.729799747467041, + "epoch": 4.36, + "learning_rate": 3.132337747722363e-05, + "loss": 108.651, + "step": 5160, + "task_loss": 2.0735132694244385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9903584576061635, + "compression/movement_sparsity/importance_threshold": -6.752680617827609e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9148377855313485, + "compression/movement_sparsity/model_sparsity": 0.8834102932281322, + "compression_loss": 104.9365463256836, + "distillation_loss": 4.633014678955078, + "epoch": 4.36, + "learning_rate": 3.131868131868132e-05, + "loss": 109.3911, + "step": 5161, + "task_loss": 2.3478262424468994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.990396667108759, + "compression/movement_sparsity/importance_threshold": -6.72591969545092e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9148527384375639, + "compression/movement_sparsity/model_sparsity": 0.8834247324560186, + "compression_loss": 104.93992614746094, + "distillation_loss": 5.938641548156738, + "epoch": 4.36, + "learning_rate": 3.131398516013901e-05, + "loss": 109.9306, + "step": 5162, + "task_loss": 2.2888729572296143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9904347755280732, + "compression/movement_sparsity/importance_threshold": -6.699229569123808e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9149201457572095, + "compression/movement_sparsity/model_sparsity": 0.8834898241268652, + "compression_loss": 104.94325256347656, + "distillation_loss": 4.739206314086914, + "epoch": 4.36, + "learning_rate": 3.1309289001596696e-05, + "loss": 109.4739, + "step": 5163, + "task_loss": 2.029761552810669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9904727829979916, + "compression/movement_sparsity/importance_threshold": -6.672610145076665e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9149523052373234, + "compression/movement_sparsity/model_sparsity": 0.883520878829903, + "compression_loss": 104.94660949707031, + "distillation_loss": 5.401961326599121, + "epoch": 4.36, + "learning_rate": 3.130459284305438e-05, + "loss": 110.1893, + "step": 5164, + "task_loss": 3.1372413635253906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.990510689652399, + "compression/movement_sparsity/importance_threshold": -6.64606132953988e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9150199271919863, + "compression/movement_sparsity/model_sparsity": 0.8835861777623939, + "compression_loss": 104.94989776611328, + "distillation_loss": 4.2096662521362305, + "epoch": 4.37, + "learning_rate": 3.129989668451207e-05, + "loss": 108.9104, + "step": 5165, + "task_loss": 1.871734380722046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9905484956251805, + "compression/movement_sparsity/importance_threshold": -6.619583028744016e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9150929746429236, + "compression/movement_sparsity/model_sparsity": 0.8836567158086712, + "compression_loss": 104.95315551757812, + "distillation_loss": 4.093985557556152, + "epoch": 4.37, + "learning_rate": 3.129520052596976e-05, + "loss": 109.2454, + "step": 5166, + "task_loss": 2.859217405319214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9905862010502215, + "compression/movement_sparsity/importance_threshold": -6.59317514891929e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9151467407147937, + "compression/movement_sparsity/model_sparsity": 0.8837086348505688, + "compression_loss": 104.95645904541016, + "distillation_loss": 5.142999649047852, + "epoch": 4.37, + "learning_rate": 3.129050436742745e-05, + "loss": 109.0465, + "step": 5167, + "task_loss": 3.2797698974609375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9906238060614072, + "compression/movement_sparsity/importance_threshold": -6.566837596296091e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9151496621358646, + "compression/movement_sparsity/model_sparsity": 0.8837114559118384, + "compression_loss": 104.9597396850586, + "distillation_loss": 3.6974377632141113, + "epoch": 4.37, + "learning_rate": 3.128580820888513e-05, + "loss": 108.6989, + "step": 5168, + "task_loss": 1.7171962261199951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9906613107926224, + "compression/movement_sparsity/importance_threshold": -6.540570277104897e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9151798779766538, + "compression/movement_sparsity/model_sparsity": 0.8837406337455418, + "compression_loss": 104.96306610107422, + "distillation_loss": 4.341638088226318, + "epoch": 4.37, + "learning_rate": 3.128111205034282e-05, + "loss": 109.1302, + "step": 5169, + "task_loss": 2.4382357597351074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9906987153777524, + "compression/movement_sparsity/importance_threshold": -6.514373097576184e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.915240190416556, + "compression/movement_sparsity/model_sparsity": 0.8837988742675906, + "compression_loss": 104.96639251708984, + "distillation_loss": 5.1464691162109375, + "epoch": 4.37, + "learning_rate": 3.127641589180051e-05, + "loss": 109.8384, + "step": 5170, + "task_loss": 2.981036901473999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9907360199506825, + "compression/movement_sparsity/importance_threshold": -6.488245963940167e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9152430879892915, + "compression/movement_sparsity/model_sparsity": 0.8838016722997887, + "compression_loss": 104.96966552734375, + "distillation_loss": 5.598273277282715, + "epoch": 4.37, + "learning_rate": 3.12717197332582e-05, + "loss": 109.2275, + "step": 5171, + "task_loss": 3.0510916709899902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9907732246452976, + "compression/movement_sparsity/importance_threshold": -6.462188782427412e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9153133332608344, + "compression/movement_sparsity/model_sparsity": 0.8838695044301543, + "compression_loss": 104.97297668457031, + "distillation_loss": 4.7313032150268555, + "epoch": 4.37, + "learning_rate": 3.126702357471588e-05, + "loss": 109.1807, + "step": 5172, + "task_loss": 2.4155080318450928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.990810329595483, + "compression/movement_sparsity/importance_threshold": -6.436201459268133e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9153414027514493, + "compression/movement_sparsity/model_sparsity": 0.8838966096474147, + "compression_loss": 104.97627258300781, + "distillation_loss": 5.187762260437012, + "epoch": 4.37, + "learning_rate": 3.126232741617357e-05, + "loss": 109.5827, + "step": 5173, + "task_loss": 3.240830659866333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9908473349351238, + "compression/movement_sparsity/importance_threshold": -6.410283900692808e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9153681367352888, + "compression/movement_sparsity/model_sparsity": 0.883922425236666, + "compression_loss": 104.97953033447266, + "distillation_loss": 4.195384979248047, + "epoch": 4.37, + "learning_rate": 3.125763125763126e-05, + "loss": 109.0365, + "step": 5174, + "task_loss": 2.2656729221343994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.990884240798105, + "compression/movement_sparsity/importance_threshold": -6.384436012931913e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9153929986248096, + "compression/movement_sparsity/model_sparsity": 0.8839464330437976, + "compression_loss": 104.98286437988281, + "distillation_loss": 3.8446359634399414, + "epoch": 4.37, + "learning_rate": 3.1252935099088946e-05, + "loss": 108.9258, + "step": 5175, + "task_loss": 2.444943904876709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9909210473183119, + "compression/movement_sparsity/importance_threshold": -6.358657702215838e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9154103005920492, + "compression/movement_sparsity/model_sparsity": 0.8839631406352355, + "compression_loss": 104.98612976074219, + "distillation_loss": 5.214948654174805, + "epoch": 4.38, + "learning_rate": 3.124823894054664e-05, + "loss": 109.741, + "step": 5176, + "task_loss": 3.891021966934204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9909577546296297, + "compression/movement_sparsity/importance_threshold": -6.332948874774886e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9154084165735628, + "compression/movement_sparsity/model_sparsity": 0.88396132133858, + "compression_loss": 104.98937225341797, + "distillation_loss": 5.145688533782959, + "epoch": 4.38, + "learning_rate": 3.124354278200432e-05, + "loss": 109.879, + "step": 5177, + "task_loss": 2.7917745113372803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9909943628659433, + "compression/movement_sparsity/importance_threshold": -6.307309436839447e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9154349239982172, + "compression/movement_sparsity/model_sparsity": 0.8839869181516512, + "compression_loss": 104.99263000488281, + "distillation_loss": 4.5916900634765625, + "epoch": 4.38, + "learning_rate": 3.123884662346201e-05, + "loss": 109.2796, + "step": 5178, + "task_loss": 3.4848484992980957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.991030872161138, + "compression/movement_sparsity/importance_threshold": -6.281739294640085e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9154578422484134, + "compression/movement_sparsity/model_sparsity": 0.8840090490894483, + "compression_loss": 104.9958724975586, + "distillation_loss": 4.382300853729248, + "epoch": 4.38, + "learning_rate": 3.12341504649197e-05, + "loss": 109.7403, + "step": 5179, + "task_loss": 3.206920623779297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.991067282649099, + "compression/movement_sparsity/importance_threshold": -6.256238354407015e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9155544160820963, + "compression/movement_sparsity/model_sparsity": 0.8841023053148483, + "compression_loss": 104.99909210205078, + "distillation_loss": 4.8595781326293945, + "epoch": 4.38, + "learning_rate": 3.1229454306377384e-05, + "loss": 109.1354, + "step": 5180, + "task_loss": 3.020352840423584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9911035944637113, + "compression/movement_sparsity/importance_threshold": -6.230806522370715e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9156044498894964, + "compression/movement_sparsity/model_sparsity": 0.8841506203070423, + "compression_loss": 105.00225067138672, + "distillation_loss": 5.12705135345459, + "epoch": 4.38, + "learning_rate": 3.122475814783507e-05, + "loss": 108.9351, + "step": 5181, + "task_loss": 2.468276023864746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9911398077388601, + "compression/movement_sparsity/importance_threshold": -6.20544370476166e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9156905423798273, + "compression/movement_sparsity/model_sparsity": 0.8842337552554789, + "compression_loss": 105.0054931640625, + "distillation_loss": 3.2620387077331543, + "epoch": 4.38, + "learning_rate": 3.122006198929276e-05, + "loss": 109.459, + "step": 5182, + "task_loss": 2.3089542388916016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9911759226084307, + "compression/movement_sparsity/importance_threshold": -6.180149807809981e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9157700646537908, + "compression/movement_sparsity/model_sparsity": 0.8843105456946926, + "compression_loss": 105.00869750976562, + "distillation_loss": 3.813184976577759, + "epoch": 4.38, + "learning_rate": 3.121536583075045e-05, + "loss": 108.9346, + "step": 5183, + "task_loss": 2.378330707550049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9912119392063079, + "compression/movement_sparsity/importance_threshold": -6.154924737746415e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9157795205187261, + "compression/movement_sparsity/model_sparsity": 0.8843196767215776, + "compression_loss": 105.01184844970703, + "distillation_loss": 3.9441606998443604, + "epoch": 4.38, + "learning_rate": 3.1210669672208136e-05, + "loss": 109.1843, + "step": 5184, + "task_loss": 2.789583444595337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9912478576663771, + "compression/movement_sparsity/importance_threshold": -6.129768400801091e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9157608234238731, + "compression/movement_sparsity/model_sparsity": 0.8843016219294518, + "compression_loss": 105.01504516601562, + "distillation_loss": 4.102852821350098, + "epoch": 4.38, + "learning_rate": 3.120597351366582e-05, + "loss": 109.4644, + "step": 5185, + "task_loss": 2.080470323562622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9912836781225234, + "compression/movement_sparsity/importance_threshold": -6.104680703204659e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9158012821246615, + "compression/movement_sparsity/model_sparsity": 0.8843406907494027, + "compression_loss": 105.01820373535156, + "distillation_loss": 2.7837815284729004, + "epoch": 4.38, + "learning_rate": 3.120127735512351e-05, + "loss": 109.0451, + "step": 5186, + "task_loss": 2.034298896789551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9913194007086319, + "compression/movement_sparsity/importance_threshold": -6.079661551187249e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.915877990295063, + "compression/movement_sparsity/model_sparsity": 0.884414763758169, + "compression_loss": 105.02132415771484, + "distillation_loss": 5.289899826049805, + "epoch": 4.38, + "learning_rate": 3.1196581196581195e-05, + "loss": 109.0506, + "step": 5187, + "task_loss": 2.0994086265563965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9913550255585877, + "compression/movement_sparsity/importance_threshold": -6.054710850979424e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9159320783194593, + "compression/movement_sparsity/model_sparsity": 0.884466993692533, + "compression_loss": 105.02450561523438, + "distillation_loss": 2.5548782348632812, + "epoch": 4.39, + "learning_rate": 3.119188503803889e-05, + "loss": 108.5132, + "step": 5188, + "task_loss": 1.4657349586486816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.991390552806276, + "compression/movement_sparsity/importance_threshold": -6.029828508811574e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.916018111188952, + "compression/movement_sparsity/model_sparsity": 0.8845500710682906, + "compression_loss": 105.0275650024414, + "distillation_loss": 2.1204538345336914, + "epoch": 4.39, + "learning_rate": 3.118718887949657e-05, + "loss": 108.6126, + "step": 5189, + "task_loss": 2.1900856494903564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9914259825855819, + "compression/movement_sparsity/importance_threshold": -6.005014430914002e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9160028721027135, + "compression/movement_sparsity/model_sparsity": 0.8845353554915453, + "compression_loss": 105.0306396484375, + "distillation_loss": 5.179567337036133, + "epoch": 4.39, + "learning_rate": 3.118249272095426e-05, + "loss": 109.7562, + "step": 5190, + "task_loss": 2.449612855911255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9914613150303906, + "compression/movement_sparsity/importance_threshold": -5.980268523517185e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9159807408475813, + "compression/movement_sparsity/model_sparsity": 0.8845139845131106, + "compression_loss": 105.03373718261719, + "distillation_loss": 3.9097352027893066, + "epoch": 4.39, + "learning_rate": 3.117779656241195e-05, + "loss": 108.818, + "step": 5191, + "task_loss": 1.9897174835205078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9914965502745872, + "compression/movement_sparsity/importance_threshold": -5.955590692851599e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9159594085116807, + "compression/movement_sparsity/model_sparsity": 0.8844933850085741, + "compression_loss": 105.036865234375, + "distillation_loss": 5.177759170532227, + "epoch": 4.39, + "learning_rate": 3.1173100403869634e-05, + "loss": 109.3945, + "step": 5192, + "task_loss": 3.0190560817718506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9915316884520569, + "compression/movement_sparsity/importance_threshold": -5.930980845147461e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9160386326814534, + "compression/movement_sparsity/model_sparsity": 0.884569887584393, + "compression_loss": 105.03993225097656, + "distillation_loss": 4.347969055175781, + "epoch": 4.39, + "learning_rate": 3.116840424532733e-05, + "loss": 108.6545, + "step": 5193, + "task_loss": 2.2191271781921387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9915667296966847, + "compression/movement_sparsity/importance_threshold": -5.906438886635248e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9160528939859459, + "compression/movement_sparsity/model_sparsity": 0.8845836589692033, + "compression_loss": 105.04303741455078, + "distillation_loss": 5.541364669799805, + "epoch": 4.39, + "learning_rate": 3.116370808678501e-05, + "loss": 109.5767, + "step": 5194, + "task_loss": 2.881110906600952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9916016741423559, + "compression/movement_sparsity/importance_threshold": -5.8819647235453486e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9160739043693202, + "compression/movement_sparsity/model_sparsity": 0.8846039475812733, + "compression_loss": 105.04610443115234, + "distillation_loss": 3.384204387664795, + "epoch": 4.39, + "learning_rate": 3.11590119282427e-05, + "loss": 108.9597, + "step": 5195, + "task_loss": 1.069176197052002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9916365219229556, + "compression/movement_sparsity/importance_threshold": -5.85755826210824e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9160279963239222, + "compression/movement_sparsity/model_sparsity": 0.8845596166184643, + "compression_loss": 105.04917907714844, + "distillation_loss": 4.428487300872803, + "epoch": 4.39, + "learning_rate": 3.1154315769700386e-05, + "loss": 109.1245, + "step": 5196, + "task_loss": 2.1247103214263916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9916712731723689, + "compression/movement_sparsity/importance_threshold": -5.833219408554226e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9160003441791746, + "compression/movement_sparsity/model_sparsity": 0.8845329144099567, + "compression_loss": 105.05216979980469, + "distillation_loss": 6.729247570037842, + "epoch": 4.39, + "learning_rate": 3.114961961115808e-05, + "loss": 109.5855, + "step": 5197, + "task_loss": 3.328348398208618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9917059280244809, + "compression/movement_sparsity/importance_threshold": -5.808948069113782e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.916006652063854, + "compression/movement_sparsity/model_sparsity": 0.8845390055993921, + "compression_loss": 105.05521392822266, + "distillation_loss": 2.5887954235076904, + "epoch": 4.39, + "learning_rate": 3.114492345261576e-05, + "loss": 109.094, + "step": 5198, + "task_loss": 1.767388105392456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9917404866131768, + "compression/movement_sparsity/importance_threshold": -5.784744150017212e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.91606795420967, + "compression/movement_sparsity/model_sparsity": 0.8845982018279118, + "compression_loss": 105.05825805664062, + "distillation_loss": 4.55316686630249, + "epoch": 4.39, + "learning_rate": 3.114022729407345e-05, + "loss": 109.1969, + "step": 5199, + "task_loss": 3.254136562347412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9917749490723418, + "compression/movement_sparsity/importance_threshold": -5.760607557494992e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9161305560897582, + "compression/movement_sparsity/model_sparsity": 0.8846586531408331, + "compression_loss": 105.06127166748047, + "distillation_loss": 3.6371212005615234, + "epoch": 4.4, + "learning_rate": 3.113553113553114e-05, + "loss": 109.2471, + "step": 5200, + "task_loss": 1.6408169269561768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9918093155358609, + "compression/movement_sparsity/importance_threshold": -5.736538197777426e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9161105592606329, + "compression/movement_sparsity/model_sparsity": 0.8846393432643057, + "compression_loss": 105.06424713134766, + "distillation_loss": 6.0265398025512695, + "epoch": 4.4, + "learning_rate": 3.1130834976988825e-05, + "loss": 109.7745, + "step": 5201, + "task_loss": 3.2487926483154297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9918435861376194, + "compression/movement_sparsity/importance_threshold": -5.712535977095076e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9161418602006771, + "compression/movement_sparsity/model_sparsity": 0.8846695689207664, + "compression_loss": 105.0672836303711, + "distillation_loss": 4.190788269042969, + "epoch": 4.4, + "learning_rate": 3.112613881844651e-05, + "loss": 109.7062, + "step": 5202, + "task_loss": 1.9235881567001343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9918777610115023, + "compression/movement_sparsity/importance_threshold": -5.68860080167816e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.916117463353694, + "compression/movement_sparsity/model_sparsity": 0.8846460101805308, + "compression_loss": 105.0703125, + "distillation_loss": 5.201738357543945, + "epoch": 4.4, + "learning_rate": 3.11214426599042e-05, + "loss": 108.7223, + "step": 5203, + "task_loss": 2.6606781482696533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9919118402913948, + "compression/movement_sparsity/importance_threshold": -5.664732577757153e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9161768099360178, + "compression/movement_sparsity/model_sparsity": 0.8847033180251802, + "compression_loss": 105.07328033447266, + "distillation_loss": 4.3482561111450195, + "epoch": 4.4, + "learning_rate": 3.111674650136189e-05, + "loss": 108.881, + "step": 5204, + "task_loss": 2.82692551612854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.991945824111182, + "compression/movement_sparsity/importance_threshold": -5.640931211562533e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9161644803466823, + "compression/movement_sparsity/model_sparsity": 0.8846914119951687, + "compression_loss": 105.07627868652344, + "distillation_loss": 4.290942668914795, + "epoch": 4.4, + "learning_rate": 3.111205034281958e-05, + "loss": 109.1719, + "step": 5205, + "task_loss": 2.9466912746429443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9919797126047492, + "compression/movement_sparsity/importance_threshold": -5.617196609324516e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9162554617457439, + "compression/movement_sparsity/model_sparsity": 0.884779267903281, + "compression_loss": 105.07929229736328, + "distillation_loss": 4.758189678192139, + "epoch": 4.4, + "learning_rate": 3.110735418427726e-05, + "loss": 110.1907, + "step": 5206, + "task_loss": 2.4261672496795654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9920135059059813, + "compression/movement_sparsity/importance_threshold": -5.5935286772737514e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9162495354344289, + "compression/movement_sparsity/model_sparsity": 0.8847735451789911, + "compression_loss": 105.08228302001953, + "distillation_loss": 4.856335639953613, + "epoch": 4.4, + "learning_rate": 3.110265802573495e-05, + "loss": 109.4997, + "step": 5207, + "task_loss": 2.2679269313812256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9920472041487636, + "compression/movement_sparsity/importance_threshold": -5.5699273216403694e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9162789404318189, + "compression/movement_sparsity/model_sparsity": 0.8848019400242605, + "compression_loss": 105.08528137207031, + "distillation_loss": 4.781373500823975, + "epoch": 4.4, + "learning_rate": 3.1097961867192636e-05, + "loss": 109.3018, + "step": 5208, + "task_loss": 3.045623779296875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9920808074669812, + "compression/movement_sparsity/importance_threshold": -5.5463924486548466e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9162843659280933, + "compression/movement_sparsity/model_sparsity": 0.884807179138047, + "compression_loss": 105.08821868896484, + "distillation_loss": 4.957886695861816, + "epoch": 4.4, + "learning_rate": 3.109326570865033e-05, + "loss": 109.6677, + "step": 5209, + "task_loss": 2.168710708618164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9921143159945193, + "compression/movement_sparsity/importance_threshold": -5.522923964547746e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9162806336636232, + "compression/movement_sparsity/model_sparsity": 0.8848035750883433, + "compression_loss": 105.09123229980469, + "distillation_loss": 3.7046523094177246, + "epoch": 4.4, + "learning_rate": 3.1088569550108015e-05, + "loss": 109.2973, + "step": 5210, + "task_loss": 2.2956881523132324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9921477298652629, + "compression/movement_sparsity/importance_threshold": -5.499521775549285e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.916322392098684, + "compression/movement_sparsity/model_sparsity": 0.8848438989926958, + "compression_loss": 105.09418487548828, + "distillation_loss": 4.785365581512451, + "epoch": 4.4, + "learning_rate": 3.10838733915657e-05, + "loss": 108.9035, + "step": 5211, + "task_loss": 1.9259170293807983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9921810492130972, + "compression/movement_sparsity/importance_threshold": -5.476185787889939e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9163803197050591, + "compression/movement_sparsity/model_sparsity": 0.8848998366075856, + "compression_loss": 105.09716796875, + "distillation_loss": 3.4944117069244385, + "epoch": 4.41, + "learning_rate": 3.107917723302339e-05, + "loss": 109.2491, + "step": 5212, + "task_loss": 2.2266061305999756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9922142741719074, + "compression/movement_sparsity/importance_threshold": -5.452915907800098e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9164475123896871, + "compression/movement_sparsity/model_sparsity": 0.8849647210167879, + "compression_loss": 105.10005950927734, + "distillation_loss": 4.810935020446777, + "epoch": 4.41, + "learning_rate": 3.1074481074481074e-05, + "loss": 109.4236, + "step": 5213, + "task_loss": 1.8845292329788208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9922474048755787, + "compression/movement_sparsity/importance_threshold": -5.429712041510066e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9164710387724327, + "compression/movement_sparsity/model_sparsity": 0.8849874391959105, + "compression_loss": 105.10299682617188, + "distillation_loss": 5.648877143859863, + "epoch": 4.41, + "learning_rate": 3.106978491593877e-05, + "loss": 108.8446, + "step": 5214, + "task_loss": 3.403153419494629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9922804414579961, + "compression/movement_sparsity/importance_threshold": -5.4065740952504056e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9165145739084712, + "compression/movement_sparsity/model_sparsity": 0.8850294787660964, + "compression_loss": 105.10582733154297, + "distillation_loss": 4.363511085510254, + "epoch": 4.41, + "learning_rate": 3.106508875739645e-05, + "loss": 109.3709, + "step": 5215, + "task_loss": 3.137528896331787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9923133840530447, + "compression/movement_sparsity/importance_threshold": -5.38350197525142e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9164697509623281, + "compression/movement_sparsity/model_sparsity": 0.8849861956260446, + "compression_loss": 105.10870361328125, + "distillation_loss": 4.313808441162109, + "epoch": 4.41, + "learning_rate": 3.106039259885414e-05, + "loss": 109.1564, + "step": 5216, + "task_loss": 1.8479979038238525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9923462327946098, + "compression/movement_sparsity/importance_threshold": -5.360495587743586e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9164773943537826, + "compression/movement_sparsity/model_sparsity": 0.884993576443489, + "compression_loss": 105.11153411865234, + "distillation_loss": 5.190854072570801, + "epoch": 4.41, + "learning_rate": 3.1055696440311826e-05, + "loss": 109.4673, + "step": 5217, + "task_loss": 3.533594846725464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9923789878165764, + "compression/movement_sparsity/importance_threshold": -5.3375548389571204e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9164991917322209, + "compression/movement_sparsity/model_sparsity": 0.8850146250149215, + "compression_loss": 105.1143798828125, + "distillation_loss": 4.179282188415527, + "epoch": 4.41, + "learning_rate": 3.105100028176951e-05, + "loss": 109.0415, + "step": 5218, + "task_loss": 3.01669979095459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9924116492528298, + "compression/movement_sparsity/importance_threshold": -5.314679635122586e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9165469480236026, + "compression/movement_sparsity/model_sparsity": 0.8850607407307786, + "compression_loss": 105.11717224121094, + "distillation_loss": 3.131842851638794, + "epoch": 4.41, + "learning_rate": 3.10463041232272e-05, + "loss": 108.5636, + "step": 5219, + "task_loss": 2.3780667781829834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.992444217237255, + "compression/movement_sparsity/importance_threshold": -5.291869882470372e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9165299322363862, + "compression/movement_sparsity/model_sparsity": 0.8850443094881997, + "compression_loss": 105.11996459960938, + "distillation_loss": 5.197847366333008, + "epoch": 4.41, + "learning_rate": 3.1041607964684885e-05, + "loss": 109.2749, + "step": 5220, + "task_loss": 2.126046657562256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9924766919037372, + "compression/movement_sparsity/importance_threshold": -5.269125487230696e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9165645242466979, + "compression/movement_sparsity/model_sparsity": 0.8850777131565397, + "compression_loss": 105.12274169921875, + "distillation_loss": 4.998417377471924, + "epoch": 4.41, + "learning_rate": 3.103691180614258e-05, + "loss": 109.4562, + "step": 5221, + "task_loss": 3.4918718338012695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9925090733861615, + "compression/movement_sparsity/importance_threshold": -5.246446355634208e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9165749459692116, + "compression/movement_sparsity/model_sparsity": 0.8850877768608241, + "compression_loss": 105.12553405761719, + "distillation_loss": 4.365262031555176, + "epoch": 4.41, + "learning_rate": 3.1032215647600265e-05, + "loss": 108.9588, + "step": 5222, + "task_loss": 2.1591427326202393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9925413618184131, + "compression/movement_sparsity/importance_threshold": -5.2238323939111236e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9165188546846526, + "compression/movement_sparsity/model_sparsity": 0.8850336124844466, + "compression_loss": 105.1282958984375, + "distillation_loss": 3.4839558601379395, + "epoch": 4.41, + "learning_rate": 3.102751948905796e-05, + "loss": 109.4647, + "step": 5223, + "task_loss": 1.432780146598816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9925735573343771, + "compression/movement_sparsity/importance_threshold": -5.20128350829192e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9165048080151774, + "compression/movement_sparsity/model_sparsity": 0.8850200483612807, + "compression_loss": 105.13108825683594, + "distillation_loss": 2.9023187160491943, + "epoch": 4.42, + "learning_rate": 3.102282333051564e-05, + "loss": 108.5372, + "step": 5224, + "task_loss": 1.7747489213943481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9926056600679386, + "compression/movement_sparsity/importance_threshold": -5.1787996050070735e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9165949189500017, + "compression/movement_sparsity/model_sparsity": 0.88510706370828, + "compression_loss": 105.13382720947266, + "distillation_loss": 4.6007585525512695, + "epoch": 4.42, + "learning_rate": 3.1018127171973324e-05, + "loss": 109.1657, + "step": 5225, + "task_loss": 2.361626148223877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9926376701529829, + "compression/movement_sparsity/importance_threshold": -5.156380590286801e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9166634829139079, + "compression/movement_sparsity/model_sparsity": 0.8851732722890986, + "compression_loss": 105.13655090332031, + "distillation_loss": 3.852369546890259, + "epoch": 4.42, + "learning_rate": 3.101343101343102e-05, + "loss": 108.9645, + "step": 5226, + "task_loss": 2.1826553344726562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.992669587723395, + "compression/movement_sparsity/importance_threshold": -5.134026370361665e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9166910158169791, + "compression/movement_sparsity/model_sparsity": 0.8851998593522482, + "compression_loss": 105.13926696777344, + "distillation_loss": 4.546185493469238, + "epoch": 4.42, + "learning_rate": 3.10087348548887e-05, + "loss": 108.9962, + "step": 5227, + "task_loss": 2.1144495010375977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.99270141291306, + "compression/movement_sparsity/importance_threshold": -5.111736851461882e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.916755728274739, + "compression/movement_sparsity/model_sparsity": 0.8852623487380051, + "compression_loss": 105.14201354980469, + "distillation_loss": 2.830380439758301, + "epoch": 4.42, + "learning_rate": 3.100403869634639e-05, + "loss": 109.0362, + "step": 5228, + "task_loss": 1.2092900276184082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9927331458558631, + "compression/movement_sparsity/importance_threshold": -5.089511939818103e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9167049193964425, + "compression/movement_sparsity/model_sparsity": 0.8852132853009846, + "compression_loss": 105.14472198486328, + "distillation_loss": 4.088048458099365, + "epoch": 4.42, + "learning_rate": 3.0999342537804076e-05, + "loss": 108.6941, + "step": 5229, + "task_loss": 3.030275821685791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9927647866856896, + "compression/movement_sparsity/importance_threshold": -5.067351541660457e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.91675077974517, + "compression/movement_sparsity/model_sparsity": 0.8852575702056504, + "compression_loss": 105.14742279052734, + "distillation_loss": 4.595587730407715, + "epoch": 4.42, + "learning_rate": 3.099464637926177e-05, + "loss": 109.3805, + "step": 5230, + "task_loss": 1.858496069908142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9927963355364243, + "compression/movement_sparsity/importance_threshold": -5.0452555632195945e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.916702021823707, + "compression/movement_sparsity/model_sparsity": 0.8852104872687865, + "compression_loss": 105.15009307861328, + "distillation_loss": 3.731786012649536, + "epoch": 4.42, + "learning_rate": 3.0989950220719455e-05, + "loss": 109.4625, + "step": 5231, + "task_loss": 2.152480363845825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9928277925419527, + "compression/movement_sparsity/importance_threshold": -5.0232239107256445e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.916727956888315, + "compression/movement_sparsity/model_sparsity": 0.8852355313841397, + "compression_loss": 105.15281677246094, + "distillation_loss": 3.6329870223999023, + "epoch": 4.42, + "learning_rate": 3.0985254062177135e-05, + "loss": 108.9785, + "step": 5232, + "task_loss": 1.7407716512680054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9928591578361597, + "compression/movement_sparsity/importance_threshold": -5.001256490409171e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.916751864844425, + "compression/movement_sparsity/model_sparsity": 0.8852586180284078, + "compression_loss": 105.15546417236328, + "distillation_loss": 4.558316230773926, + "epoch": 4.42, + "learning_rate": 3.098055790363483e-05, + "loss": 108.7487, + "step": 5233, + "task_loss": 2.8748414516448975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9928904315529306, + "compression/movement_sparsity/importance_threshold": -4.9793532085006496e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9167828557561105, + "compression/movement_sparsity/model_sparsity": 0.8852885443069377, + "compression_loss": 105.15814971923828, + "distillation_loss": 3.4313440322875977, + "epoch": 4.42, + "learning_rate": 3.0975861745092514e-05, + "loss": 109.0161, + "step": 5234, + "task_loss": 2.4024136066436768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9929216138261504, + "compression/movement_sparsity/importance_threshold": -4.957513971230211e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9168376711547326, + "compression/movement_sparsity/model_sparsity": 0.8853414766279852, + "compression_loss": 105.1607666015625, + "distillation_loss": 4.455592632293701, + "epoch": 4.42, + "learning_rate": 3.097116558655021e-05, + "loss": 109.0204, + "step": 5235, + "task_loss": 2.1069235801696777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9929527047897043, + "compression/movement_sparsity/importance_threshold": -4.935738684828505e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9168179843539658, + "compression/movement_sparsity/model_sparsity": 0.8853224661293885, + "compression_loss": 105.16339874267578, + "distillation_loss": 3.966323137283325, + "epoch": 4.43, + "learning_rate": 3.0966469428007894e-05, + "loss": 109.1315, + "step": 5236, + "task_loss": 2.7347984313964844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9929837045774774, + "compression/movement_sparsity/importance_threshold": -4.9140272555258345e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9168137512744551, + "compression/movement_sparsity/model_sparsity": 0.8853183784691814, + "compression_loss": 105.1659927368164, + "distillation_loss": 5.205983638763428, + "epoch": 4.43, + "learning_rate": 3.096177326946558e-05, + "loss": 109.5408, + "step": 5237, + "task_loss": 2.3927104473114014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9930146133233549, + "compression/movement_sparsity/importance_threshold": -4.892379589552676e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9168113306684249, + "compression/movement_sparsity/model_sparsity": 0.8853160410184151, + "compression_loss": 105.1685791015625, + "distillation_loss": 4.418153762817383, + "epoch": 4.43, + "learning_rate": 3.0957077110923267e-05, + "loss": 109.0223, + "step": 5238, + "task_loss": 2.2882320880889893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9930454311612219, + "compression/movement_sparsity/importance_threshold": -4.8707955931393335e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9168251030820443, + "compression/movement_sparsity/model_sparsity": 0.8853293403072577, + "compression_loss": 105.17120361328125, + "distillation_loss": 5.473730087280273, + "epoch": 4.43, + "learning_rate": 3.095238095238095e-05, + "loss": 109.5473, + "step": 5239, + "task_loss": 3.9921133518218994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9930761582249636, + "compression/movement_sparsity/importance_threshold": -4.849275172516109e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9168887065922141, + "compression/movement_sparsity/model_sparsity": 0.8853907588411859, + "compression_loss": 105.17384338378906, + "distillation_loss": 4.805716514587402, + "epoch": 4.43, + "learning_rate": 3.0947684793838646e-05, + "loss": 108.8276, + "step": 5240, + "task_loss": 2.215390682220459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9931067946484651, + "compression/movement_sparsity/importance_threshold": -4.8278182339136534e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9169547426325815, + "compression/movement_sparsity/model_sparsity": 0.885454526340416, + "compression_loss": 105.17639923095703, + "distillation_loss": 3.5674562454223633, + "epoch": 4.43, + "learning_rate": 3.0942988635296326e-05, + "loss": 109.5402, + "step": 5241, + "task_loss": 2.259798526763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9931373405656115, + "compression/movement_sparsity/importance_threshold": -4.806424683562183e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9169637096066436, + "compression/movement_sparsity/model_sparsity": 0.8854631852713335, + "compression_loss": 105.17903137207031, + "distillation_loss": 3.3868844509124756, + "epoch": 4.43, + "learning_rate": 3.093829247675402e-05, + "loss": 108.8692, + "step": 5242, + "task_loss": 2.981285572052002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.993167796110288, + "compression/movement_sparsity/importance_threshold": -4.785094427692087e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9170276708418424, + "compression/movement_sparsity/model_sparsity": 0.8855249492413354, + "compression_loss": 105.18157958984375, + "distillation_loss": 4.184350490570068, + "epoch": 4.43, + "learning_rate": 3.0933596318211705e-05, + "loss": 109.3318, + "step": 5243, + "task_loss": 1.5083893537521362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9931981614163798, + "compression/movement_sparsity/importance_threshold": -4.763827372533842e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9170235450798404, + "compression/movement_sparsity/model_sparsity": 0.8855209652119506, + "compression_loss": 105.18406677246094, + "distillation_loss": 2.452932834625244, + "epoch": 4.43, + "learning_rate": 3.092890015966939e-05, + "loss": 108.6115, + "step": 5244, + "task_loss": 1.7664517164230347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9932284366177719, + "compression/movement_sparsity/importance_threshold": -4.742623424317839e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9170272177234723, + "compression/movement_sparsity/model_sparsity": 0.8855245116889753, + "compression_loss": 105.18659973144531, + "distillation_loss": 3.682135820388794, + "epoch": 4.43, + "learning_rate": 3.092420400112708e-05, + "loss": 109.0627, + "step": 5245, + "task_loss": 2.246311664581299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9932586218483495, + "compression/movement_sparsity/importance_threshold": -4.721482489274554e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9170278735526922, + "compression/movement_sparsity/model_sparsity": 0.8855251449884439, + "compression_loss": 105.1890640258789, + "distillation_loss": 3.267014741897583, + "epoch": 4.43, + "learning_rate": 3.0919507842584764e-05, + "loss": 108.9767, + "step": 5246, + "task_loss": 1.7679287195205688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9932887172419977, + "compression/movement_sparsity/importance_threshold": -4.7004044736341163e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9170059450084099, + "compression/movement_sparsity/model_sparsity": 0.8855039697571178, + "compression_loss": 105.19158172607422, + "distillation_loss": 4.145192623138428, + "epoch": 4.44, + "learning_rate": 3.091481168404246e-05, + "loss": 108.6701, + "step": 5247, + "task_loss": 2.332385540008545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9933187229326018, + "compression/movement_sparsity/importance_threshold": -4.6793892836271764e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9170852884198589, + "compression/movement_sparsity/model_sparsity": 0.8855805874782947, + "compression_loss": 105.19409942626953, + "distillation_loss": 4.399828910827637, + "epoch": 4.44, + "learning_rate": 3.0910115525500144e-05, + "loss": 109.2916, + "step": 5248, + "task_loss": 2.642291307449341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9933486390540468, + "compression/movement_sparsity/importance_threshold": -4.658436825484037e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.917103949742209, + "compression/movement_sparsity/model_sparsity": 0.8855986077268131, + "compression_loss": 105.19660186767578, + "distillation_loss": 5.788728713989258, + "epoch": 4.44, + "learning_rate": 3.090541936695783e-05, + "loss": 109.7256, + "step": 5249, + "task_loss": 2.857025623321533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9933784657402178, + "compression/movement_sparsity/importance_threshold": -4.6375470054351754e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9170873513008599, + "compression/movement_sparsity/model_sparsity": 0.8855825794929871, + "compression_loss": 105.19908142089844, + "distillation_loss": 3.5973081588745117, + "epoch": 4.44, + "learning_rate": 3.0900723208415516e-05, + "loss": 109.228, + "step": 5250, + "task_loss": 2.441951274871826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.993408203125, + "compression/movement_sparsity/importance_threshold": -4.616719729710894e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9171206435768993, + "compression/movement_sparsity/model_sparsity": 0.8856147280769254, + "compression_loss": 105.2015609741211, + "distillation_loss": 3.9575893878936768, + "epoch": 4.44, + "learning_rate": 3.08960270498732e-05, + "loss": 109.0105, + "step": 5251, + "task_loss": 2.564823627471924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9934378513422786, + "compression/movement_sparsity/importance_threshold": -4.595954904541583e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9171166489807412, + "compression/movement_sparsity/model_sparsity": 0.8856108707074343, + "compression_loss": 105.2041015625, + "distillation_loss": 4.515467166900635, + "epoch": 4.44, + "learning_rate": 3.0891330891330896e-05, + "loss": 109.6784, + "step": 5252, + "task_loss": 1.8135656118392944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9934674105259387, + "compression/movement_sparsity/importance_threshold": -4.5752524361577195e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9171779392023895, + "compression/movement_sparsity/model_sparsity": 0.8856700554214183, + "compression_loss": 105.2065658569336, + "distillation_loss": 4.74397087097168, + "epoch": 4.44, + "learning_rate": 3.088663473278858e-05, + "loss": 109.1572, + "step": 5253, + "task_loss": 2.8507578372955322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9934968808098653, + "compression/movement_sparsity/importance_threshold": -4.5546122307896925e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9172048162762408, + "compression/movement_sparsity/model_sparsity": 0.8856960091850992, + "compression_loss": 105.20906066894531, + "distillation_loss": 3.138256072998047, + "epoch": 4.44, + "learning_rate": 3.088193857424627e-05, + "loss": 108.8121, + "step": 5254, + "task_loss": 2.1193490028381348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9935262623279438, + "compression/movement_sparsity/importance_threshold": -4.5340341946678055e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9172208781300463, + "compression/movement_sparsity/model_sparsity": 0.8857115192648144, + "compression_loss": 105.21150970458984, + "distillation_loss": 5.062928199768066, + "epoch": 4.44, + "learning_rate": 3.0877242415703955e-05, + "loss": 109.5122, + "step": 5255, + "task_loss": 2.8871235847473145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9935555552140591, + "compression/movement_sparsity/importance_threshold": -4.513518234022622e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9172792946272944, + "compression/movement_sparsity/model_sparsity": 0.8857679289756718, + "compression_loss": 105.21401977539062, + "distillation_loss": 5.650177955627441, + "epoch": 4.44, + "learning_rate": 3.087254625716164e-05, + "loss": 109.7726, + "step": 5256, + "task_loss": 2.3311243057250977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9935847596020966, + "compression/movement_sparsity/importance_threshold": -4.493064255084358e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9173170942387001, + "compression/movement_sparsity/model_sparsity": 0.8858044300541406, + "compression_loss": 105.21644592285156, + "distillation_loss": 5.053370475769043, + "epoch": 4.44, + "learning_rate": 3.0867850098619334e-05, + "loss": 109.4565, + "step": 5257, + "task_loss": 2.589144706726074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9936138756259413, + "compression/movement_sparsity/importance_threshold": -4.4726721640834036e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9173144589976525, + "compression/movement_sparsity/model_sparsity": 0.88580188534173, + "compression_loss": 105.21892547607422, + "distillation_loss": 5.526743412017822, + "epoch": 4.44, + "learning_rate": 3.0863153940077014e-05, + "loss": 109.1872, + "step": 5258, + "task_loss": 2.6437430381774902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9936429034194781, + "compression/movement_sparsity/importance_threshold": -4.452341867250409e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.917342409246591, + "compression/movement_sparsity/model_sparsity": 0.8858288754136323, + "compression_loss": 105.22139739990234, + "distillation_loss": 4.86757755279541, + "epoch": 4.45, + "learning_rate": 3.085845778153471e-05, + "loss": 109.1638, + "step": 5259, + "task_loss": 2.7847962379455566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9936718431165926, + "compression/movement_sparsity/importance_threshold": -4.4320732708155036e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9173501003347162, + "compression/movement_sparsity/model_sparsity": 0.8858363022892198, + "compression_loss": 105.22379302978516, + "distillation_loss": 4.006549835205078, + "epoch": 4.45, + "learning_rate": 3.085376162299239e-05, + "loss": 109.687, + "step": 5260, + "task_loss": 2.1678307056427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9937006948511696, + "compression/movement_sparsity/importance_threshold": -4.411866281009251e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9173517220215146, + "compression/movement_sparsity/model_sparsity": 0.8858378682660879, + "compression_loss": 105.22624206542969, + "distillation_loss": 5.853924751281738, + "epoch": 4.45, + "learning_rate": 3.0849065464450086e-05, + "loss": 109.7367, + "step": 5261, + "task_loss": 3.138742446899414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9937294587570944, + "compression/movement_sparsity/importance_threshold": -4.391720804061954e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9174298133953619, + "compression/movement_sparsity/model_sparsity": 0.8859132769610063, + "compression_loss": 105.22869873046875, + "distillation_loss": 2.8816800117492676, + "epoch": 4.45, + "learning_rate": 3.0844369305907766e-05, + "loss": 108.588, + "step": 5262, + "task_loss": 2.378624200820923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9937581349682522, + "compression/movement_sparsity/importance_threshold": -4.371636746204003e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9174160409817425, + "compression/movement_sparsity/model_sparsity": 0.8858999776721637, + "compression_loss": 105.23108673095703, + "distillation_loss": 4.33465576171875, + "epoch": 4.45, + "learning_rate": 3.083967314736545e-05, + "loss": 108.9621, + "step": 5263, + "task_loss": 2.159965991973877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9937867236185278, + "compression/movement_sparsity/importance_threshold": -4.351614013665961e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9174656216707724, + "compression/movement_sparsity/model_sparsity": 0.8859478551119974, + "compression_loss": 105.23346710205078, + "distillation_loss": 3.8796310424804688, + "epoch": 4.45, + "learning_rate": 3.0834976988823145e-05, + "loss": 109.4398, + "step": 5264, + "task_loss": 2.3852086067199707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9938152248418067, + "compression/movement_sparsity/importance_threshold": -4.3316525126779575e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9174593376344283, + "compression/movement_sparsity/model_sparsity": 0.8859417869516336, + "compression_loss": 105.23591613769531, + "distillation_loss": 5.581107139587402, + "epoch": 4.45, + "learning_rate": 3.083028083028083e-05, + "loss": 109.9642, + "step": 5265, + "task_loss": 3.0037808418273926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9938436387719739, + "compression/movement_sparsity/importance_threshold": -4.311752149470643e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9175175752691619, + "compression/movement_sparsity/model_sparsity": 0.8859980239444541, + "compression_loss": 105.23829650878906, + "distillation_loss": 4.918869495391846, + "epoch": 4.45, + "learning_rate": 3.082558467173852e-05, + "loss": 109.7805, + "step": 5266, + "task_loss": 2.8536181449890137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9938719655429147, + "compression/movement_sparsity/importance_threshold": -4.291912830274233e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9175291178634334, + "compression/movement_sparsity/model_sparsity": 0.8860091700151033, + "compression_loss": 105.24070739746094, + "distillation_loss": 4.777923583984375, + "epoch": 4.45, + "learning_rate": 3.0820888513196204e-05, + "loss": 109.757, + "step": 5267, + "task_loss": 3.8504798412323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9939002052885139, + "compression/movement_sparsity/importance_threshold": -4.272134461319292e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9175766833681328, + "compression/movement_sparsity/model_sparsity": 0.8860551014983877, + "compression_loss": 105.24308776855469, + "distillation_loss": 5.100000381469727, + "epoch": 4.45, + "learning_rate": 3.08161923546539e-05, + "loss": 109.7757, + "step": 5268, + "task_loss": 3.0352768898010254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9939283581426569, + "compression/movement_sparsity/importance_threshold": -4.252416948836036e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9176074357964658, + "compression/movement_sparsity/model_sparsity": 0.8860847974862018, + "compression_loss": 105.24553680419922, + "distillation_loss": 6.948433876037598, + "epoch": 4.45, + "learning_rate": 3.0811496196111584e-05, + "loss": 110.2996, + "step": 5269, + "task_loss": 3.181124210357666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9939564242392288, + "compression/movement_sparsity/importance_threshold": -4.232760199054941e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.917685002506937, + "compression/movement_sparsity/model_sparsity": 0.8861596995415453, + "compression_loss": 105.24791717529297, + "distillation_loss": 4.228874206542969, + "epoch": 4.45, + "learning_rate": 3.080680003756927e-05, + "loss": 109.5546, + "step": 5270, + "task_loss": 1.6805094480514526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9939844037121147, + "compression/movement_sparsity/importance_threshold": -4.213164118206485e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9176515313683832, + "compression/movement_sparsity/model_sparsity": 0.88612737823957, + "compression_loss": 105.25028991699219, + "distillation_loss": 2.8455381393432617, + "epoch": 4.46, + "learning_rate": 3.0802103879026956e-05, + "loss": 108.9509, + "step": 5271, + "task_loss": 2.339909076690674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9940122966951997, + "compression/movement_sparsity/importance_threshold": -4.193628612520969e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9176898675673324, + "compression/movement_sparsity/model_sparsity": 0.8861643974721495, + "compression_loss": 105.25262451171875, + "distillation_loss": 3.362929582595825, + "epoch": 4.46, + "learning_rate": 3.079740772048464e-05, + "loss": 109.5408, + "step": 5272, + "task_loss": 2.3124594688415527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9940401033223691, + "compression/movement_sparsity/importance_threshold": -4.174153588228785e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9176952811394391, + "compression/movement_sparsity/model_sparsity": 0.8861696250714002, + "compression_loss": 105.2550048828125, + "distillation_loss": 5.497345924377441, + "epoch": 4.46, + "learning_rate": 3.0792711561942336e-05, + "loss": 109.056, + "step": 5273, + "task_loss": 3.7860612869262695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9940678237275079, + "compression/movement_sparsity/importance_threshold": -4.154738951560495e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9176756420353428, + "compression/movement_sparsity/model_sparsity": 0.8861506606309466, + "compression_loss": 105.25736999511719, + "distillation_loss": 3.5265634059906006, + "epoch": 4.46, + "learning_rate": 3.078801540340002e-05, + "loss": 108.8464, + "step": 5274, + "task_loss": 2.1905648708343506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9940954580445013, + "compression/movement_sparsity/importance_threshold": -4.1353846087462294e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9177010047399043, + "compression/movement_sparsity/model_sparsity": 0.8861751520485815, + "compression_loss": 105.25968170166016, + "distillation_loss": 4.278606414794922, + "epoch": 4.46, + "learning_rate": 3.078331924485771e-05, + "loss": 108.987, + "step": 5275, + "task_loss": 2.090707302093506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9941230064072344, + "compression/movement_sparsity/importance_threshold": -4.116090466016638e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9177302427989474, + "compression/movement_sparsity/model_sparsity": 0.8862033856903498, + "compression_loss": 105.26197052001953, + "distillation_loss": 3.99467396736145, + "epoch": 4.46, + "learning_rate": 3.0778623086315395e-05, + "loss": 109.8677, + "step": 5276, + "task_loss": 2.405158281326294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9941504689495924, + "compression/movement_sparsity/importance_threshold": -4.096856429601937e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9177634396816458, + "compression/movement_sparsity/model_sparsity": 0.8862354421580018, + "compression_loss": 105.26428985595703, + "distillation_loss": 3.988898754119873, + "epoch": 4.46, + "learning_rate": 3.077392692777308e-05, + "loss": 108.9139, + "step": 5277, + "task_loss": 2.250352621078491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9941778458054604, + "compression/movement_sparsity/importance_threshold": -4.077682405732516e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9177769259152418, + "compression/movement_sparsity/model_sparsity": 0.8862484650979854, + "compression_loss": 105.26655578613281, + "distillation_loss": 3.4501352310180664, + "epoch": 4.46, + "learning_rate": 3.0769230769230774e-05, + "loss": 108.8478, + "step": 5278, + "task_loss": 1.312605619430542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9942051371087235, + "compression/movement_sparsity/importance_threshold": -4.058568300638939e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9178341380715588, + "compression/movement_sparsity/model_sparsity": 0.8863037118407276, + "compression_loss": 105.26884460449219, + "distillation_loss": 4.382536888122559, + "epoch": 4.46, + "learning_rate": 3.0764534610688454e-05, + "loss": 109.1438, + "step": 5279, + "task_loss": 2.3028433322906494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9942323429932669, + "compression/movement_sparsity/importance_threshold": -4.0395140205515094e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9178003569046463, + "compression/movement_sparsity/model_sparsity": 0.8862710911608217, + "compression_loss": 105.27108764648438, + "distillation_loss": 5.245271682739258, + "epoch": 4.46, + "learning_rate": 3.075983845214615e-05, + "loss": 108.9027, + "step": 5280, + "task_loss": 2.6318256855010986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9942594635929758, + "compression/movement_sparsity/importance_threshold": -4.0205194717005295e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9178585706910446, + "compression/movement_sparsity/model_sparsity": 0.8863273051245706, + "compression_loss": 105.27335357666016, + "distillation_loss": 4.9941725730896, + "epoch": 4.46, + "learning_rate": 3.075514229360383e-05, + "loss": 109.4201, + "step": 5281, + "task_loss": 2.400455951690674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9942864990417352, + "compression/movement_sparsity/importance_threshold": -4.00158456031665e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9178685631435235, + "compression/movement_sparsity/model_sparsity": 0.8863369543055665, + "compression_loss": 105.27555084228516, + "distillation_loss": 5.163814067840576, + "epoch": 4.46, + "learning_rate": 3.075044613506152e-05, + "loss": 109.0704, + "step": 5282, + "task_loss": 3.985599994659424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9943134494734303, + "compression/movement_sparsity/importance_threshold": -3.98270919263e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9178773035584006, + "compression/movement_sparsity/model_sparsity": 0.8863453944603039, + "compression_loss": 105.27781677246094, + "distillation_loss": 3.9177510738372803, + "epoch": 4.47, + "learning_rate": 3.074574997651921e-05, + "loss": 109.3048, + "step": 5283, + "task_loss": 2.3960378170013428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9943403150219462, + "compression/movement_sparsity/importance_threshold": -3.963893274871143e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9178436297089969, + "compression/movement_sparsity/model_sparsity": 0.8863128774112201, + "compression_loss": 105.280029296875, + "distillation_loss": 5.306855201721191, + "epoch": 4.47, + "learning_rate": 3.074105381797689e-05, + "loss": 110.1449, + "step": 5284, + "task_loss": 2.90151047706604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9943670958211681, + "compression/movement_sparsity/importance_threshold": -3.945136713270469e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9178832656422184, + "compression/movement_sparsity/model_sparsity": 0.8863511517282011, + "compression_loss": 105.28221893310547, + "distillation_loss": 3.8354897499084473, + "epoch": 4.47, + "learning_rate": 3.0736357659434586e-05, + "loss": 109.723, + "step": 5285, + "task_loss": 2.0803427696228027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9943937920049812, + "compression/movement_sparsity/importance_threshold": -3.926439414058281e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.917800368828814, + "compression/movement_sparsity/model_sparsity": 0.8862711026753575, + "compression_loss": 105.28441619873047, + "distillation_loss": 3.583292007446289, + "epoch": 4.47, + "learning_rate": 3.073166150089227e-05, + "loss": 109.135, + "step": 5286, + "task_loss": 2.5634682178497314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9944204037072706, + "compression/movement_sparsity/importance_threshold": -3.907801283464969e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9177729790157544, + "compression/movement_sparsity/model_sparsity": 0.8862446537866374, + "compression_loss": 105.2865982055664, + "distillation_loss": 3.181723117828369, + "epoch": 4.47, + "learning_rate": 3.072696534234996e-05, + "loss": 109.4952, + "step": 5287, + "task_loss": 1.8665926456451416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9944469310619213, + "compression/movement_sparsity/importance_threshold": -3.8892222277211834e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.917788873931213, + "compression/movement_sparsity/model_sparsity": 0.8862600026628515, + "compression_loss": 105.2886962890625, + "distillation_loss": 3.6482009887695312, + "epoch": 4.47, + "learning_rate": 3.0722269183807645e-05, + "loss": 109.0572, + "step": 5288, + "task_loss": 2.057821035385132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9944733742028186, + "compression/movement_sparsity/importance_threshold": -3.8707021530569666e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9178495440961443, + "compression/movement_sparsity/model_sparsity": 0.8863185886209741, + "compression_loss": 105.29085540771484, + "distillation_loss": 4.0911545753479, + "epoch": 4.47, + "learning_rate": 3.071757302526533e-05, + "loss": 109.0379, + "step": 5289, + "task_loss": 2.0330793857574463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9944997332638476, + "compression/movement_sparsity/importance_threshold": -3.852240965702969e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9179039183005638, + "compression/movement_sparsity/model_sparsity": 0.8863710949041973, + "compression_loss": 105.29296875, + "distillation_loss": 4.281589031219482, + "epoch": 4.47, + "learning_rate": 3.0712876866723024e-05, + "loss": 109.2861, + "step": 5290, + "task_loss": 1.6291879415512085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9945260083788934, + "compression/movement_sparsity/importance_threshold": -3.8338385718894065e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9180227426310552, + "compression/movement_sparsity/model_sparsity": 0.8864858372533899, + "compression_loss": 105.29511260986328, + "distillation_loss": 3.5341243743896484, + "epoch": 4.47, + "learning_rate": 3.070818070818071e-05, + "loss": 110.1602, + "step": 5291, + "task_loss": 2.307964324951172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9945521996818412, + "compression/movement_sparsity/importance_threshold": -3.815494877846843e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9180179848881684, + "compression/movement_sparsity/model_sparsity": 0.8864812429536079, + "compression_loss": 105.29718017578125, + "distillation_loss": 3.4971156120300293, + "epoch": 4.47, + "learning_rate": 3.07034845496384e-05, + "loss": 109.9427, + "step": 5292, + "task_loss": 2.1439735889434814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9945783073065761, + "compression/movement_sparsity/importance_threshold": -3.7972097898054946e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9180805271474186, + "compression/movement_sparsity/model_sparsity": 0.8865416366938502, + "compression_loss": 105.29926300048828, + "distillation_loss": 3.723022222518921, + "epoch": 4.47, + "learning_rate": 3.069878839109608e-05, + "loss": 108.9296, + "step": 5293, + "task_loss": 1.5795183181762695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9946043313869832, + "compression/movement_sparsity/importance_threshold": -3.7789832139960115e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.918141912762408, + "compression/movement_sparsity/model_sparsity": 0.8866009135241205, + "compression_loss": 105.30138397216797, + "distillation_loss": 5.6574530601501465, + "epoch": 4.47, + "learning_rate": 3.0694092232553776e-05, + "loss": 110.1239, + "step": 5294, + "task_loss": 2.6224288940429688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9946302720569479, + "compression/movement_sparsity/importance_threshold": -3.7608150566485234e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9182028333348596, + "compression/movement_sparsity/model_sparsity": 0.8866597412874948, + "compression_loss": 105.30339050292969, + "distillation_loss": 3.7984118461608887, + "epoch": 4.48, + "learning_rate": 3.068939607401146e-05, + "loss": 109.015, + "step": 5295, + "task_loss": 2.6459646224975586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.994656129450355, + "compression/movement_sparsity/importance_threshold": -3.7427052239935935e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9182601170361823, + "compression/movement_sparsity/model_sparsity": 0.8867150571174518, + "compression_loss": 105.30545806884766, + "distillation_loss": 4.660604953765869, + "epoch": 4.48, + "learning_rate": 3.068469991546914e-05, + "loss": 109.3836, + "step": 5296, + "task_loss": 2.5265064239501953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9946819037010897, + "compression/movement_sparsity/importance_threshold": -3.7246536222616984e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9183058819915687, + "compression/movement_sparsity/model_sparsity": 0.8867592499058312, + "compression_loss": 105.30752563476562, + "distillation_loss": 4.230428218841553, + "epoch": 4.48, + "learning_rate": 3.0680003756926835e-05, + "loss": 109.076, + "step": 5297, + "task_loss": 2.614109992980957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9947075949430373, + "compression/movement_sparsity/importance_threshold": -3.706660157682968e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9183171622541523, + "compression/movement_sparsity/model_sparsity": 0.8867701426566929, + "compression_loss": 105.30957794189453, + "distillation_loss": 3.8527121543884277, + "epoch": 4.48, + "learning_rate": 3.067530759838452e-05, + "loss": 109.2545, + "step": 5298, + "task_loss": 2.9995381832122803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9947332033100829, + "compression/movement_sparsity/importance_threshold": -3.688724736487965e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9183194755426737, + "compression/movement_sparsity/model_sparsity": 0.886772376476637, + "compression_loss": 105.31165313720703, + "distillation_loss": 3.8960328102111816, + "epoch": 4.48, + "learning_rate": 3.0670611439842215e-05, + "loss": 109.3, + "step": 5299, + "task_loss": 2.283010244369507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9947587289361115, + "compression/movement_sparsity/importance_threshold": -3.67084726490708e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9184033024411538, + "compression/movement_sparsity/model_sparsity": 0.8868533236632726, + "compression_loss": 105.31376647949219, + "distillation_loss": 4.463361740112305, + "epoch": 4.48, + "learning_rate": 3.06659152812999e-05, + "loss": 108.9276, + "step": 5300, + "task_loss": 1.5154794454574585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9947841719550083, + "compression/movement_sparsity/importance_threshold": -3.65302764917079e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185040735818444, + "compression/movement_sparsity/model_sparsity": 0.8869506330052723, + "compression_loss": 105.3158187866211, + "distillation_loss": 4.496820449829102, + "epoch": 4.48, + "learning_rate": 3.066121912275759e-05, + "loss": 109.5484, + "step": 5301, + "task_loss": 3.04172420501709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9948095325006586, + "compression/movement_sparsity/importance_threshold": -3.63526579550931e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9184917916891795, + "compression/movement_sparsity/model_sparsity": 0.886938773033404, + "compression_loss": 105.3178939819336, + "distillation_loss": 4.582427978515625, + "epoch": 4.48, + "learning_rate": 3.0656522964215274e-05, + "loss": 109.8135, + "step": 5302, + "task_loss": 2.1595518589019775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9948348107069473, + "compression/movement_sparsity/importance_threshold": -3.617561610153204e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9184646403594726, + "compression/movement_sparsity/model_sparsity": 0.8869125544353997, + "compression_loss": 105.31996154785156, + "distillation_loss": 4.720130920410156, + "epoch": 4.48, + "learning_rate": 3.065182680567296e-05, + "loss": 109.5574, + "step": 5303, + "task_loss": 3.4044392108917236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9948600067077598, + "compression/movement_sparsity/importance_threshold": -3.5999149993327755e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185295674522499, + "compression/movement_sparsity/model_sparsity": 0.8869752510828011, + "compression_loss": 105.32205963134766, + "distillation_loss": 5.022746562957764, + "epoch": 4.48, + "learning_rate": 3.064713064713065e-05, + "loss": 110.2888, + "step": 5304, + "task_loss": 2.9065051078796387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.994885120636981, + "compression/movement_sparsity/importance_threshold": -3.582325869278414e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185984891411852, + "compression/movement_sparsity/model_sparsity": 0.8870418050996934, + "compression_loss": 105.32415771484375, + "distillation_loss": 4.093623161315918, + "epoch": 4.48, + "learning_rate": 3.064243448858833e-05, + "loss": 108.9882, + "step": 5305, + "task_loss": 2.3920021057128906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9949101526284962, + "compression/movement_sparsity/importance_threshold": -3.5647941262205093e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185527003374635, + "compression/movement_sparsity/model_sparsity": 0.8869975892822425, + "compression_loss": 105.32622528076172, + "distillation_loss": 3.9799437522888184, + "epoch": 4.48, + "learning_rate": 3.0637738330046026e-05, + "loss": 109.1541, + "step": 5306, + "task_loss": 2.2192273139953613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9949351028161905, + "compression/movement_sparsity/importance_threshold": -3.547319676389538e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9186018159839556, + "compression/movement_sparsity/model_sparsity": 0.8870450176551802, + "compression_loss": 105.3282699584961, + "distillation_loss": 4.50498104095459, + "epoch": 4.49, + "learning_rate": 3.063304217150371e-05, + "loss": 109.4839, + "step": 5307, + "task_loss": 2.276562213897705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.994959971333949, + "compression/movement_sparsity/importance_threshold": -3.5299024260158907e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.918594172592501, + "compression/movement_sparsity/model_sparsity": 0.8870376368377358, + "compression_loss": 105.33024597167969, + "distillation_loss": 3.230168342590332, + "epoch": 4.49, + "learning_rate": 3.06283460129614e-05, + "loss": 109.4122, + "step": 5308, + "task_loss": 1.8821237087249756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9949847583156568, + "compression/movement_sparsity/importance_threshold": -3.5125422813298696e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9186003493113364, + "compression/movement_sparsity/model_sparsity": 0.8870436013672774, + "compression_loss": 105.3322525024414, + "distillation_loss": 4.47476863861084, + "epoch": 4.49, + "learning_rate": 3.0623649854419085e-05, + "loss": 110.2725, + "step": 5309, + "task_loss": 3.0541789531707764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9950094638951992, + "compression/movement_sparsity/importance_threshold": -3.4952391485619516e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185686667979279, + "compression/movement_sparsity/model_sparsity": 0.8870130072456713, + "compression_loss": 105.3342514038086, + "distillation_loss": 4.209713935852051, + "epoch": 4.49, + "learning_rate": 3.061895369587677e-05, + "loss": 109.6739, + "step": 5310, + "task_loss": 2.536536455154419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9950340882064611, + "compression/movement_sparsity/importance_threshold": -3.4779929339426134e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185996338612782, + "compression/movement_sparsity/model_sparsity": 0.8870429104951297, + "compression_loss": 105.33627319335938, + "distillation_loss": 4.76332426071167, + "epoch": 4.49, + "learning_rate": 3.0614257537334464e-05, + "loss": 109.6265, + "step": 5311, + "task_loss": 2.166757583618164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9950586313833278, + "compression/movement_sparsity/importance_threshold": -3.460803543702158e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9186276914277254, + "compression/movement_sparsity/model_sparsity": 0.8870700041978543, + "compression_loss": 105.3382339477539, + "distillation_loss": 4.837841033935547, + "epoch": 4.49, + "learning_rate": 3.060956137879215e-05, + "loss": 109.2596, + "step": 5312, + "task_loss": 1.9219720363616943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9950830935596846, + "compression/movement_sparsity/importance_threshold": -3.443670884070802e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9186098409487745, + "compression/movement_sparsity/model_sparsity": 0.8870527669377699, + "compression_loss": 105.3401870727539, + "distillation_loss": 4.139411449432373, + "epoch": 4.49, + "learning_rate": 3.060486522024984e-05, + "loss": 109.3898, + "step": 5313, + "task_loss": 1.2577857971191406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9951074748694163, + "compression/movement_sparsity/importance_threshold": -3.426594861279282e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185456412302231, + "compression/movement_sparsity/model_sparsity": 0.8869907726770521, + "compression_loss": 105.34212493896484, + "distillation_loss": 4.207692623138428, + "epoch": 4.49, + "learning_rate": 3.060016906170752e-05, + "loss": 109.8421, + "step": 5314, + "task_loss": 2.4605307579040527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9951317754464083, + "compression/movement_sparsity/importance_threshold": -3.4095753815578146e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9186028772348752, + "compression/movement_sparsity/model_sparsity": 0.8870460424488659, + "compression_loss": 105.34404754638672, + "distillation_loss": 4.348567485809326, + "epoch": 4.49, + "learning_rate": 3.059547290316521e-05, + "loss": 110.3525, + "step": 5315, + "task_loss": 2.480313777923584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9951559954245457, + "compression/movement_sparsity/importance_threshold": -3.3926123511367895e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185856468126413, + "compression/movement_sparsity/model_sparsity": 0.8870294039446428, + "compression_loss": 105.34600067138672, + "distillation_loss": 4.381779670715332, + "epoch": 4.49, + "learning_rate": 3.05907767446229e-05, + "loss": 109.9789, + "step": 5316, + "task_loss": 2.176074981689453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9951801349377135, + "compression/movement_sparsity/importance_threshold": -3.3757056762465966e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9186402237279108, + "compression/movement_sparsity/model_sparsity": 0.8870821059749744, + "compression_loss": 105.34786987304688, + "distillation_loss": 3.0911898612976074, + "epoch": 4.49, + "learning_rate": 3.058608058608059e-05, + "loss": 109.0962, + "step": 5317, + "task_loss": 2.878769874572754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9952041941197969, + "compression/movement_sparsity/importance_threshold": -3.358855263117799e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185957823551318, + "compression/movement_sparsity/model_sparsity": 0.8870391913000681, + "compression_loss": 105.34982299804688, + "distillation_loss": 4.0792717933654785, + "epoch": 4.5, + "learning_rate": 3.0581384427538275e-05, + "loss": 109.5954, + "step": 5318, + "task_loss": 2.5904035568237305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9952281731046811, + "compression/movement_sparsity/importance_threshold": -3.342061017980527e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185010209949297, + "compression/movement_sparsity/model_sparsity": 0.8869476852841089, + "compression_loss": 105.35173797607422, + "distillation_loss": 6.36912727355957, + "epoch": 4.5, + "learning_rate": 3.057668826899596e-05, + "loss": 109.8856, + "step": 5319, + "task_loss": 3.078721284866333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9952520720262512, + "compression/movement_sparsity/importance_threshold": -3.325322847065517e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185061960836837, + "compression/movement_sparsity/model_sparsity": 0.8869526825926437, + "compression_loss": 105.35358428955078, + "distillation_loss": 3.458751678466797, + "epoch": 4.5, + "learning_rate": 3.057199211045365e-05, + "loss": 109.4872, + "step": 5320, + "task_loss": 2.1588242053985596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9952758910183924, + "compression/movement_sparsity/importance_threshold": -3.308640656602812e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.918523700761773, + "compression/movement_sparsity/model_sparsity": 0.8869695859311901, + "compression_loss": 105.35547637939453, + "distillation_loss": 3.0859713554382324, + "epoch": 4.5, + "learning_rate": 3.056729595191134e-05, + "loss": 109.0976, + "step": 5321, + "task_loss": 2.262476921081543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9952996302149898, + "compression/movement_sparsity/importance_threshold": -3.292014352822975e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185321072999564, + "compression/movement_sparsity/model_sparsity": 0.8869777036789253, + "compression_loss": 105.35736083984375, + "distillation_loss": 5.503211498260498, + "epoch": 4.5, + "learning_rate": 3.056259979336902e-05, + "loss": 109.3787, + "step": 5322, + "task_loss": 1.9152984619140625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9953232897499285, + "compression/movement_sparsity/importance_threshold": -3.275443841956483e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185563133602571, + "compression/movement_sparsity/model_sparsity": 0.8870010781865882, + "compression_loss": 105.35924530029297, + "distillation_loss": 3.88155198097229, + "epoch": 4.5, + "learning_rate": 3.0557903634826714e-05, + "loss": 108.9887, + "step": 5323, + "task_loss": 1.7157261371612549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9953468697570936, + "compression/movement_sparsity/importance_threshold": -3.2589290302335525e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9185710277831198, + "compression/movement_sparsity/model_sparsity": 0.8870152871237587, + "compression_loss": 105.36111450195312, + "distillation_loss": 4.435579776763916, + "epoch": 4.5, + "learning_rate": 3.05532074762844e-05, + "loss": 109.7016, + "step": 5324, + "task_loss": 1.9408481121063232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9953703703703703, + "compression/movement_sparsity/importance_threshold": -3.2424698238847464e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9186168165868415, + "compression/movement_sparsity/model_sparsity": 0.8870595029412097, + "compression_loss": 105.3630142211914, + "distillation_loss": 3.593562364578247, + "epoch": 4.5, + "learning_rate": 3.054851131774209e-05, + "loss": 109.1611, + "step": 5325, + "task_loss": 1.881563425064087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.995393791723644, + "compression/movement_sparsity/importance_threshold": -3.226066129140368e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9186343331890986, + "compression/movement_sparsity/model_sparsity": 0.8870764177942919, + "compression_loss": 105.36489868164062, + "distillation_loss": 3.640774965286255, + "epoch": 4.5, + "learning_rate": 3.054381515919977e-05, + "loss": 109.4842, + "step": 5326, + "task_loss": 2.349426746368408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9954171339507993, + "compression/movement_sparsity/importance_threshold": -3.2097178522308076e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9186680905076757, + "compression/movement_sparsity/model_sparsity": 0.8871090154451262, + "compression_loss": 105.36671447753906, + "distillation_loss": 3.9799606800079346, + "epoch": 4.5, + "learning_rate": 3.053911900065746e-05, + "loss": 109.231, + "step": 5327, + "task_loss": 1.6338565349578857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9954403971857217, + "compression/movement_sparsity/importance_threshold": -3.193424899386628e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9186610314004353, + "compression/movement_sparsity/model_sparsity": 0.8871021988399358, + "compression_loss": 105.36857604980469, + "distillation_loss": 5.128190040588379, + "epoch": 4.5, + "learning_rate": 3.053442284211515e-05, + "loss": 109.4388, + "step": 5328, + "task_loss": 2.8053669929504395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9954635815622964, + "compression/movement_sparsity/importance_threshold": -3.177187176837959e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9187238717638763, + "compression/movement_sparsity/model_sparsity": 0.887162880443573, + "compression_loss": 105.37039947509766, + "distillation_loss": 2.9506044387817383, + "epoch": 4.5, + "learning_rate": 3.052972668357284e-05, + "loss": 109.2275, + "step": 5329, + "task_loss": 1.4456278085708618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9954866872144084, + "compression/movement_sparsity/importance_threshold": -3.161004590815364e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9187905040126255, + "compression/movement_sparsity/model_sparsity": 0.887227223669593, + "compression_loss": 105.37216186523438, + "distillation_loss": 5.158373832702637, + "epoch": 4.51, + "learning_rate": 3.052503052503053e-05, + "loss": 109.7285, + "step": 5330, + "task_loss": 2.8485584259033203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9955097142759428, + "compression/movement_sparsity/importance_threshold": -3.144877047549233e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9188355773662891, + "compression/movement_sparsity/model_sparsity": 0.8872707486148963, + "compression_loss": 105.37397003173828, + "distillation_loss": 5.424570083618164, + "epoch": 4.51, + "learning_rate": 3.052033436648821e-05, + "loss": 110.2662, + "step": 5331, + "task_loss": 3.008532762527466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9955326628807848, + "compression/movement_sparsity/importance_threshold": -3.1288044532698685e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9187537895004747, + "compression/movement_sparsity/model_sparsity": 0.8871917704138815, + "compression_loss": 105.3757553100586, + "distillation_loss": 5.057656764984131, + "epoch": 4.51, + "learning_rate": 3.05156382079459e-05, + "loss": 109.6675, + "step": 5332, + "task_loss": 2.7585439682006836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9955555331628195, + "compression/movement_sparsity/importance_threshold": -3.1127867142078346e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9188274570081291, + "compression/movement_sparsity/model_sparsity": 0.8872629072160203, + "compression_loss": 105.37752532958984, + "distillation_loss": 3.6905813217163086, + "epoch": 4.51, + "learning_rate": 3.051094204940359e-05, + "loss": 108.8345, + "step": 5333, + "task_loss": 2.8215889930725098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9955783252559322, + "compression/movement_sparsity/importance_threshold": -3.096823736593434e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9188260141838451, + "compression/movement_sparsity/model_sparsity": 0.8872615139571891, + "compression_loss": 105.37918090820312, + "distillation_loss": 3.648729085922241, + "epoch": 4.51, + "learning_rate": 3.050624589086128e-05, + "loss": 109.232, + "step": 5334, + "task_loss": 2.2307941913604736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9956010392940078, + "compression/movement_sparsity/importance_threshold": -3.080915426657057e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.91883083154757, + "compression/movement_sparsity/model_sparsity": 0.8872661658296501, + "compression_loss": 105.38093566894531, + "distillation_loss": 5.115367889404297, + "epoch": 4.51, + "learning_rate": 3.0501549732318963e-05, + "loss": 109.2095, + "step": 5335, + "task_loss": 3.3497884273529053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9956236754109316, + "compression/movement_sparsity/importance_threshold": -3.065061690629093e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9189094475847932, + "compression/movement_sparsity/model_sparsity": 0.8873420811641435, + "compression_loss": 105.38265228271484, + "distillation_loss": 2.999159812927246, + "epoch": 4.51, + "learning_rate": 3.049685357377665e-05, + "loss": 109.4159, + "step": 5336, + "task_loss": 1.4671930074691772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9956462337405887, + "compression/movement_sparsity/importance_threshold": -3.049262434739932e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9189302791056531, + "compression/movement_sparsity/model_sparsity": 0.8873621970581765, + "compression_loss": 105.38439178466797, + "distillation_loss": 3.26348876953125, + "epoch": 4.51, + "learning_rate": 3.049215741523434e-05, + "loss": 109.7191, + "step": 5337, + "task_loss": 1.5933661460876465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9956687144168642, + "compression/movement_sparsity/importance_threshold": -3.0335175652200508e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9189319723374574, + "compression/movement_sparsity/model_sparsity": 0.8873638321222594, + "compression_loss": 105.38603973388672, + "distillation_loss": 3.244957685470581, + "epoch": 4.51, + "learning_rate": 3.048746125669203e-05, + "loss": 109.095, + "step": 5338, + "task_loss": 3.0963120460510254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9956911175736434, + "compression/movement_sparsity/importance_threshold": -3.0178269882997523e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9189316146124283, + "compression/movement_sparsity/model_sparsity": 0.8873634866861856, + "compression_loss": 105.38776397705078, + "distillation_loss": 5.366868019104004, + "epoch": 4.51, + "learning_rate": 3.0482765098149712e-05, + "loss": 109.9555, + "step": 5339, + "task_loss": 2.662236452102661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9957134433448112, + "compression/movement_sparsity/importance_threshold": -3.0021906102096e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9189896137638092, + "compression/movement_sparsity/model_sparsity": 0.8874194933882902, + "compression_loss": 105.38941955566406, + "distillation_loss": 6.266535758972168, + "epoch": 4.51, + "learning_rate": 3.0478068939607402e-05, + "loss": 109.7656, + "step": 5340, + "task_loss": 3.578381299972534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9957356918642528, + "compression/movement_sparsity/importance_threshold": -2.98660833717981e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.919012281606485, + "compression/movement_sparsity/model_sparsity": 0.8874413825208356, + "compression_loss": 105.39115905761719, + "distillation_loss": 4.859249114990234, + "epoch": 4.51, + "learning_rate": 3.047337278106509e-05, + "loss": 109.5688, + "step": 5341, + "task_loss": 2.8670389652252197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9957578632658536, + "compression/movement_sparsity/importance_threshold": -2.9710800754407726e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9189787389229253, + "compression/movement_sparsity/model_sparsity": 0.8874089921316456, + "compression_loss": 105.39287567138672, + "distillation_loss": 7.30093240737915, + "epoch": 4.52, + "learning_rate": 3.0468676622522778e-05, + "loss": 110.5444, + "step": 5342, + "task_loss": 3.6034481525421143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9957799576834985, + "compression/movement_sparsity/importance_threshold": -2.955605731222964e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9190306329004765, + "compression/movement_sparsity/model_sparsity": 0.8874591033914234, + "compression_loss": 105.3946304321289, + "distillation_loss": 3.868135929107666, + "epoch": 4.52, + "learning_rate": 3.0463980463980464e-05, + "loss": 109.5081, + "step": 5343, + "task_loss": 2.2713162899017334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9958019752510725, + "compression/movement_sparsity/importance_threshold": -2.9401852107568613e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9190498665828731, + "compression/movement_sparsity/model_sparsity": 0.8874776763376601, + "compression_loss": 105.39637756347656, + "distillation_loss": 3.9130611419677734, + "epoch": 4.52, + "learning_rate": 3.045928430543815e-05, + "loss": 109.5001, + "step": 5344, + "task_loss": 2.5347094535827637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9958239161024611, + "compression/movement_sparsity/importance_threshold": -2.9248184202726804e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9191046104364894, + "compression/movement_sparsity/model_sparsity": 0.8875305395714927, + "compression_loss": 105.39803314208984, + "distillation_loss": 4.526621341705322, + "epoch": 4.52, + "learning_rate": 3.045458814689584e-05, + "loss": 109.7642, + "step": 5345, + "task_loss": 3.4054384231567383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9958457803715492, + "compression/movement_sparsity/importance_threshold": -2.909505266000898e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9191156879882231, + "compression/movement_sparsity/model_sparsity": 0.8875412365752459, + "compression_loss": 105.39976501464844, + "distillation_loss": 4.938938140869141, + "epoch": 4.52, + "learning_rate": 3.044989198835353e-05, + "loss": 110.0505, + "step": 5346, + "task_loss": 2.9246609210968018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9958675681922221, + "compression/movement_sparsity/importance_threshold": -2.894245654171991e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.919059203206132, + "compression/movement_sparsity/model_sparsity": 0.8874866922191872, + "compression_loss": 105.40148162841797, + "distillation_loss": 4.814578056335449, + "epoch": 4.52, + "learning_rate": 3.044519582981122e-05, + "loss": 109.8968, + "step": 5347, + "task_loss": 2.663323163986206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9958892796983647, + "compression/movement_sparsity/importance_threshold": -2.8790394910162623e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.918997412169443, + "compression/movement_sparsity/model_sparsity": 0.8874270238946999, + "compression_loss": 105.40314483642578, + "distillation_loss": 3.3414509296417236, + "epoch": 4.52, + "learning_rate": 3.0440499671268903e-05, + "loss": 109.0747, + "step": 5348, + "task_loss": 2.485527515411377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9959109150238624, + "compression/movement_sparsity/importance_threshold": -2.8638866827641017e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9190376920077169, + "compression/movement_sparsity/model_sparsity": 0.8874659199966138, + "compression_loss": 105.4048080444336, + "distillation_loss": 4.4932861328125, + "epoch": 4.52, + "learning_rate": 3.043580351272659e-05, + "loss": 109.1396, + "step": 5349, + "task_loss": 3.003786087036133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9959324743026002, + "compression/movement_sparsity/importance_threshold": -2.848787135645986e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9190383597611046, + "compression/movement_sparsity/model_sparsity": 0.8874665648106183, + "compression_loss": 105.40646362304688, + "distillation_loss": 4.153212547302246, + "epoch": 4.52, + "learning_rate": 3.043110735418428e-05, + "loss": 109.2598, + "step": 5350, + "task_loss": 3.814171075820923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9959539576684633, + "compression/movement_sparsity/importance_threshold": -2.833740755892305e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.919076624415048, + "compression/movement_sparsity/model_sparsity": 0.887503514955983, + "compression_loss": 105.40805053710938, + "distillation_loss": 3.95697283744812, + "epoch": 4.52, + "learning_rate": 3.042641119564197e-05, + "loss": 108.8114, + "step": 5351, + "task_loss": 1.6072574853897095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9959753652553368, + "compression/movement_sparsity/importance_threshold": -2.8187474497333616e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9190748834865732, + "compression/movement_sparsity/model_sparsity": 0.887501833833757, + "compression_loss": 105.4096450805664, + "distillation_loss": 4.411128997802734, + "epoch": 4.52, + "learning_rate": 3.042171503709965e-05, + "loss": 110.1013, + "step": 5352, + "task_loss": 1.7729027271270752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9959966971971058, + "compression/movement_sparsity/importance_threshold": -2.8038071233996327e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9190943556523226, + "compression/movement_sparsity/model_sparsity": 0.8875206370707095, + "compression_loss": 105.41127014160156, + "distillation_loss": 3.273794174194336, + "epoch": 4.52, + "learning_rate": 3.041701887855734e-05, + "loss": 109.4814, + "step": 5353, + "task_loss": 1.678481101989746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9960179536276556, + "compression/movement_sparsity/importance_threshold": -2.788919683121508e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9191358040590248, + "compression/movement_sparsity/model_sparsity": 0.8875606615971313, + "compression_loss": 105.41287231445312, + "distillation_loss": 4.126544952392578, + "epoch": 4.53, + "learning_rate": 3.041232272001503e-05, + "loss": 109.3074, + "step": 5354, + "task_loss": 3.0084991455078125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.996039134680871, + "compression/movement_sparsity/importance_threshold": -2.7740850351294644e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9190977063434282, + "compression/movement_sparsity/model_sparsity": 0.8875238726552678, + "compression_loss": 105.41444396972656, + "distillation_loss": 3.646207809448242, + "epoch": 4.53, + "learning_rate": 3.0407626561472717e-05, + "loss": 109.6451, + "step": 5355, + "task_loss": 2.360541582107544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9960602404906376, + "compression/movement_sparsity/importance_threshold": -2.759303085653631e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9191141259222628, + "compression/movement_sparsity/model_sparsity": 0.8875397281710569, + "compression_loss": 105.41600799560547, + "distillation_loss": 4.066474437713623, + "epoch": 4.53, + "learning_rate": 3.04029304029304e-05, + "loss": 109.2783, + "step": 5356, + "task_loss": 1.9582717418670654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9960812711908402, + "compression/movement_sparsity/importance_threshold": -2.7445737409246586e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9191511266144369, + "compression/movement_sparsity/model_sparsity": 0.8875754577756273, + "compression_loss": 105.41756439208984, + "distillation_loss": 3.0736052989959717, + "epoch": 4.53, + "learning_rate": 3.039823424438809e-05, + "loss": 108.9415, + "step": 5357, + "task_loss": 2.2394940853118896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9961022269153641, + "compression/movement_sparsity/importance_threshold": -2.7298969071729365e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9191919788127574, + "compression/movement_sparsity/model_sparsity": 0.8876149065752593, + "compression_loss": 105.41910552978516, + "distillation_loss": 4.717273712158203, + "epoch": 4.53, + "learning_rate": 3.039353808584578e-05, + "loss": 109.6797, + "step": 5358, + "task_loss": 2.720681667327881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9961231077980944, + "compression/movement_sparsity/importance_threshold": -2.7152724906286813e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9192282879032085, + "compression/movement_sparsity/model_sparsity": 0.8876499683367537, + "compression_loss": 105.42064666748047, + "distillation_loss": 3.7186262607574463, + "epoch": 4.53, + "learning_rate": 3.038884192730347e-05, + "loss": 109.6702, + "step": 5359, + "task_loss": 2.33891224861145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9961439139729162, + "compression/movement_sparsity/importance_threshold": -2.700700397522543e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9191980243657487, + "compression/movement_sparsity/model_sparsity": 0.8876207444449072, + "compression_loss": 105.42220306396484, + "distillation_loss": 4.532159805297852, + "epoch": 4.53, + "learning_rate": 3.0384145768761156e-05, + "loss": 109.5392, + "step": 5360, + "task_loss": 2.9750657081604004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9961646455737146, + "compression/movement_sparsity/importance_threshold": -2.686180534084738e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9192018997202304, + "compression/movement_sparsity/model_sparsity": 0.8876244866690405, + "compression_loss": 105.42373657226562, + "distillation_loss": 2.9028866291046143, + "epoch": 4.53, + "learning_rate": 3.0379449610218842e-05, + "loss": 109.3587, + "step": 5361, + "task_loss": 1.1392909288406372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.996185302734375, + "compression/movement_sparsity/importance_threshold": -2.6717128065456563e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9192321990301932, + "compression/movement_sparsity/model_sparsity": 0.8876537451044944, + "compression_loss": 105.42526245117188, + "distillation_loss": 4.899295330047607, + "epoch": 4.53, + "learning_rate": 3.037475345167653e-05, + "loss": 110.1914, + "step": 5362, + "task_loss": 2.608349084854126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9962058855887823, + "compression/movement_sparsity/importance_threshold": -2.6572971211357745e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.91923675406223, + "compression/movement_sparsity/model_sparsity": 0.8876581436571679, + "compression_loss": 105.42680358886719, + "distillation_loss": 4.600330352783203, + "epoch": 4.53, + "learning_rate": 3.0370057293134218e-05, + "loss": 110.3389, + "step": 5363, + "task_loss": 2.151409387588501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9962263942708216, + "compression/movement_sparsity/importance_threshold": -2.6429333840854823e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9193021223492098, + "compression/movement_sparsity/model_sparsity": 0.8877212663423936, + "compression_loss": 105.42825317382812, + "distillation_loss": 5.096721649169922, + "epoch": 4.53, + "learning_rate": 3.0365361134591908e-05, + "loss": 109.6241, + "step": 5364, + "task_loss": 2.582681894302368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9962468289143782, + "compression/movement_sparsity/importance_threshold": -2.6286215016251696e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9193233235192664, + "compression/movement_sparsity/model_sparsity": 0.8877417391870362, + "compression_loss": 105.42979431152344, + "distillation_loss": 3.665910243988037, + "epoch": 4.53, + "learning_rate": 3.036066497604959e-05, + "loss": 108.9696, + "step": 5365, + "task_loss": 2.107466459274292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9962671896533372, + "compression/movement_sparsity/importance_threshold": -2.6143613799852264e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9193093841673001, + "compression/movement_sparsity/model_sparsity": 0.8877282786946924, + "compression_loss": 105.43128967285156, + "distillation_loss": 5.877031326293945, + "epoch": 4.54, + "learning_rate": 3.035596881750728e-05, + "loss": 109.7657, + "step": 5366, + "task_loss": 3.0740747451782227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9962874766215837, + "compression/movement_sparsity/importance_threshold": -2.6001529253960425e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9192995348048328, + "compression/movement_sparsity/model_sparsity": 0.8877187676881261, + "compression_loss": 105.43279266357422, + "distillation_loss": 3.9676191806793213, + "epoch": 4.54, + "learning_rate": 3.035127265896497e-05, + "loss": 110.3255, + "step": 5367, + "task_loss": 1.950934886932373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9963076899530029, + "compression/movement_sparsity/importance_threshold": -2.585996044087921e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9192658132587587, + "compression/movement_sparsity/model_sparsity": 0.8876862045808992, + "compression_loss": 105.43434143066406, + "distillation_loss": 5.508448600769043, + "epoch": 4.54, + "learning_rate": 3.0346576500422657e-05, + "loss": 109.9107, + "step": 5368, + "task_loss": 3.0187273025512695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9963278297814798, + "compression/movement_sparsity/importance_threshold": -2.571890642291512e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9193044833344017, + "compression/movement_sparsity/model_sparsity": 0.8877235462204809, + "compression_loss": 105.4358901977539, + "distillation_loss": 3.4196581840515137, + "epoch": 4.54, + "learning_rate": 3.034188034188034e-05, + "loss": 109.4535, + "step": 5369, + "task_loss": 1.3253545761108398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9963478962408997, + "compression/movement_sparsity/importance_threshold": -2.5578366262370318e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9193582851787747, + "compression/movement_sparsity/model_sparsity": 0.8877754998059858, + "compression_loss": 105.4374008178711, + "distillation_loss": 4.777370929718018, + "epoch": 4.54, + "learning_rate": 3.033718418333803e-05, + "loss": 110.0021, + "step": 5370, + "task_loss": 2.296750068664551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9963678894651476, + "compression/movement_sparsity/importance_threshold": -2.5438339021548705e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9193870820436154, + "compression/movement_sparsity/model_sparsity": 0.8878033074099296, + "compression_loss": 105.43897247314453, + "distillation_loss": 3.4624719619750977, + "epoch": 4.54, + "learning_rate": 3.033248802479572e-05, + "loss": 109.4799, + "step": 5371, + "task_loss": 2.8789312839508057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9963878095881089, + "compression/movement_sparsity/importance_threshold": -2.5298823762754177e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9193914462889701, + "compression/movement_sparsity/model_sparsity": 0.8878075217300304, + "compression_loss": 105.4404525756836, + "distillation_loss": 4.339964866638184, + "epoch": 4.54, + "learning_rate": 3.032779186625341e-05, + "loss": 110.1581, + "step": 5372, + "task_loss": 2.1647720336914062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9964076567436684, + "compression/movement_sparsity/importance_threshold": -2.5159819548293237e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9193886083370727, + "compression/movement_sparsity/model_sparsity": 0.8878047812705113, + "compression_loss": 105.44197082519531, + "distillation_loss": 3.2163329124450684, + "epoch": 4.54, + "learning_rate": 3.0323095707711092e-05, + "loss": 109.3896, + "step": 5373, + "task_loss": 2.8929033279418945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9964274310657115, + "compression/movement_sparsity/importance_threshold": -2.5021325440465446e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.919425406318397, + "compression/movement_sparsity/model_sparsity": 0.8878403151279733, + "compression_loss": 105.4434585571289, + "distillation_loss": 4.213918685913086, + "epoch": 4.54, + "learning_rate": 3.031839954916878e-05, + "loss": 109.7717, + "step": 5374, + "task_loss": 2.3298094272613525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9964471326881232, + "compression/movement_sparsity/importance_threshold": -2.488334050157817e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9194712666671245, + "compression/movement_sparsity/model_sparsity": 0.8878846000326391, + "compression_loss": 105.4449462890625, + "distillation_loss": 5.185598850250244, + "epoch": 4.54, + "learning_rate": 3.0313703390626468e-05, + "loss": 110.1756, + "step": 5375, + "task_loss": 2.385098457336426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9964667617447887, + "compression/movement_sparsity/importance_threshold": -2.4745863793934446e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9194594498169975, + "compression/movement_sparsity/model_sparsity": 0.8878731891276667, + "compression_loss": 105.44644165039062, + "distillation_loss": 5.780341625213623, + "epoch": 4.54, + "learning_rate": 3.0309007232084158e-05, + "loss": 109.8744, + "step": 5376, + "task_loss": 3.3369007110595703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9964863183695931, + "compression/movement_sparsity/importance_threshold": -2.4608894379838167e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9194840732231655, + "compression/movement_sparsity/model_sparsity": 0.8878969666440825, + "compression_loss": 105.4478530883789, + "distillation_loss": 3.264833450317383, + "epoch": 4.54, + "learning_rate": 3.0304311073541847e-05, + "loss": 108.6197, + "step": 5377, + "task_loss": 1.8578234910964966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9965058026964216, + "compression/movement_sparsity/importance_threshold": -2.4472431321593234e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9195422750853961, + "compression/movement_sparsity/model_sparsity": 0.8879531690932956, + "compression_loss": 105.44937896728516, + "distillation_loss": 2.9610748291015625, + "epoch": 4.55, + "learning_rate": 3.029961491499953e-05, + "loss": 108.8124, + "step": 5378, + "task_loss": 2.2311654090881348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9965252148591592, + "compression/movement_sparsity/importance_threshold": -2.4336473681504413e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9195850232263707, + "compression/movement_sparsity/model_sparsity": 0.887994448704119, + "compression_loss": 105.45084381103516, + "distillation_loss": 4.63469123840332, + "epoch": 4.55, + "learning_rate": 3.029491875645722e-05, + "loss": 109.8691, + "step": 5379, + "task_loss": 3.93216609954834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9965445549916914, + "compression/movement_sparsity/importance_threshold": -2.4201020521873e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.919596279640619, + "compression/movement_sparsity/model_sparsity": 0.888005318425909, + "compression_loss": 105.45220947265625, + "distillation_loss": 4.731152534484863, + "epoch": 4.55, + "learning_rate": 3.0290222597914906e-05, + "loss": 109.0632, + "step": 5380, + "task_loss": 2.890450954437256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9965638232279028, + "compression/movement_sparsity/importance_threshold": -2.406607090500723e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9196367383414075, + "compression/movement_sparsity/model_sparsity": 0.88804438724586, + "compression_loss": 105.45372009277344, + "distillation_loss": 3.114208698272705, + "epoch": 4.55, + "learning_rate": 3.0285526439372596e-05, + "loss": 109.7862, + "step": 5381, + "task_loss": 2.007270574569702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9965830197016791, + "compression/movement_sparsity/importance_threshold": -2.3931623893206667e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9196493302624309, + "compression/movement_sparsity/model_sparsity": 0.888056546595659, + "compression_loss": 105.45516204833984, + "distillation_loss": 4.088094711303711, + "epoch": 4.55, + "learning_rate": 3.028083028083028e-05, + "loss": 109.4194, + "step": 5382, + "task_loss": 1.8571776151657104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9966021445469049, + "compression/movement_sparsity/importance_threshold": -2.3797678548779545e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197189554772567, + "compression/movement_sparsity/model_sparsity": 0.8881237799701633, + "compression_loss": 105.45657348632812, + "distillation_loss": 4.378728866577148, + "epoch": 4.55, + "learning_rate": 3.027613412228797e-05, + "loss": 109.4723, + "step": 5383, + "task_loss": 3.2587201595306396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9966211978974657, + "compression/movement_sparsity/importance_threshold": -2.3664233934026294e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197008307424502, + "compression/movement_sparsity/model_sparsity": 0.8881062778757556, + "compression_loss": 105.4579849243164, + "distillation_loss": 3.4675512313842773, + "epoch": 4.55, + "learning_rate": 3.027143796374566e-05, + "loss": 109.6793, + "step": 5384, + "task_loss": 1.9856019020080566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9966401798872466, + "compression/movement_sparsity/importance_threshold": -2.3531289111253413e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.919753165914204, + "compression/movement_sparsity/model_sparsity": 0.8881568151733579, + "compression_loss": 105.4593505859375, + "distillation_loss": 2.7578558921813965, + "epoch": 4.55, + "learning_rate": 3.0266741805203348e-05, + "loss": 109.4444, + "step": 5385, + "task_loss": 1.8978077173233032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9966590906501327, + "compression/movement_sparsity/importance_threshold": -2.3398843142763068e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197703963364379, + "compression/movement_sparsity/model_sparsity": 0.8881734536775809, + "compression_loss": 105.46074676513672, + "distillation_loss": 3.9399633407592773, + "epoch": 4.55, + "learning_rate": 3.026204564666103e-05, + "loss": 109.8548, + "step": 5386, + "task_loss": 3.3344199657440186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9966779303200091, + "compression/movement_sparsity/importance_threshold": -2.3266895090860024e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197591637705248, + "compression/movement_sparsity/model_sparsity": 0.8881626069848625, + "compression_loss": 105.4620590209961, + "distillation_loss": 3.4882378578186035, + "epoch": 4.55, + "learning_rate": 3.0257349488118718e-05, + "loss": 110.0376, + "step": 5387, + "task_loss": 1.383824110031128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.996696699030761, + "compression/movement_sparsity/importance_threshold": -2.313544401784818e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197608331539938, + "compression/movement_sparsity/model_sparsity": 0.8881642190198737, + "compression_loss": 105.46339416503906, + "distillation_loss": 3.0484070777893066, + "epoch": 4.55, + "learning_rate": 3.0252653329576407e-05, + "loss": 109.0192, + "step": 5388, + "task_loss": 1.2953248023986816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9967153969162734, + "compression/movement_sparsity/importance_threshold": -2.30044889860323e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197296156831232, + "compression/movement_sparsity/model_sparsity": 0.8881340739651636, + "compression_loss": 105.46469116210938, + "distillation_loss": 5.782516002655029, + "epoch": 4.56, + "learning_rate": 3.0247957171034097e-05, + "loss": 109.5009, + "step": 5389, + "task_loss": 3.402012348175049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9967340241104317, + "compression/movement_sparsity/importance_threshold": -2.2874029057714552e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197008188182826, + "compression/movement_sparsity/model_sparsity": 0.8881062663612198, + "compression_loss": 105.4659423828125, + "distillation_loss": 4.935763835906982, + "epoch": 4.56, + "learning_rate": 3.0243261012491787e-05, + "loss": 109.8576, + "step": 5390, + "task_loss": 4.032351016998291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9967525807471208, + "compression/movement_sparsity/importance_threshold": -2.2744063295200567e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197104535457323, + "compression/movement_sparsity/model_sparsity": 0.8881155701061418, + "compression_loss": 105.46715545654297, + "distillation_loss": 5.347166061401367, + "epoch": 4.56, + "learning_rate": 3.023856485394947e-05, + "loss": 110.3588, + "step": 5391, + "task_loss": 2.7985904216766357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.996771066960226, + "compression/movement_sparsity/importance_threshold": -2.2614590760793377e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197123733367217, + "compression/movement_sparsity/model_sparsity": 0.8881174239464047, + "compression_loss": 105.46841430664062, + "distillation_loss": 3.6117000579833984, + "epoch": 4.56, + "learning_rate": 3.023386869540716e-05, + "loss": 109.2145, + "step": 5392, + "task_loss": 2.4229769706726074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9967894828836323, + "compression/movement_sparsity/importance_threshold": -2.248561051679688e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9196755753553975, + "compression/movement_sparsity/model_sparsity": 0.8880818900889428, + "compression_loss": 105.46964263916016, + "distillation_loss": 3.3416757583618164, + "epoch": 4.56, + "learning_rate": 3.0229172536864846e-05, + "loss": 108.7987, + "step": 5393, + "task_loss": 1.990061640739441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.996807828651225, + "compression/movement_sparsity/importance_threshold": -2.235712162551671e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197323701658471, + "compression/movement_sparsity/model_sparsity": 0.8881367338229321, + "compression_loss": 105.47087860107422, + "distillation_loss": 3.136685848236084, + "epoch": 4.56, + "learning_rate": 3.0224476378322535e-05, + "loss": 109.2282, + "step": 5394, + "task_loss": 1.9621330499649048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9968261043968891, + "compression/movement_sparsity/importance_threshold": -2.222912314925416e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197631106700123, + "compression/movement_sparsity/model_sparsity": 0.8881664182962105, + "compression_loss": 105.47207641601562, + "distillation_loss": 5.968227863311768, + "epoch": 4.56, + "learning_rate": 3.021978021978022e-05, + "loss": 109.9436, + "step": 5395, + "task_loss": 3.492405652999878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9968443102545098, + "compression/movement_sparsity/importance_threshold": -2.2101614150315738e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9198399857787607, + "compression/movement_sparsity/model_sparsity": 0.888240652508478, + "compression_loss": 105.47335815429688, + "distillation_loss": 4.417186737060547, + "epoch": 4.56, + "learning_rate": 3.0215084061237908e-05, + "loss": 109.2984, + "step": 5396, + "task_loss": 2.488494396209717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9968624463579723, + "compression/movement_sparsity/importance_threshold": -2.197459369100447e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.919890079206999, + "compression/movement_sparsity/model_sparsity": 0.8882890250733507, + "compression_loss": 105.47454071044922, + "distillation_loss": 3.561026096343994, + "epoch": 4.56, + "learning_rate": 3.0210387902695598e-05, + "loss": 109.5969, + "step": 5397, + "task_loss": 1.6686204671859741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9968805128411616, + "compression/movement_sparsity/importance_threshold": -2.1848060833623388e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.919880778356243, + "compression/movement_sparsity/model_sparsity": 0.888280043735431, + "compression_loss": 105.47574615478516, + "distillation_loss": 3.9778554439544678, + "epoch": 4.56, + "learning_rate": 3.0205691744153288e-05, + "loss": 109.305, + "step": 5398, + "task_loss": 1.7907391786575317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9968985098379629, + "compression/movement_sparsity/importance_threshold": -2.1722014640478127e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197684765454485, + "compression/movement_sparsity/model_sparsity": 0.8881715998373181, + "compression_loss": 105.47701263427734, + "distillation_loss": 4.297873497009277, + "epoch": 4.56, + "learning_rate": 3.020099558561097e-05, + "loss": 109.7092, + "step": 5399, + "task_loss": 2.272947311401367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9969164374822614, + "compression/movement_sparsity/importance_threshold": -2.1596454173871717e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9197937677050041, + "compression/movement_sparsity/model_sparsity": 0.8881960221677383, + "compression_loss": 105.47819519042969, + "distillation_loss": 3.4612648487091064, + "epoch": 4.56, + "learning_rate": 3.0196299427068657e-05, + "loss": 109.5361, + "step": 5400, + "task_loss": 1.7764991521835327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9969342959079422, + "compression/movement_sparsity/importance_threshold": -2.1471378496108057e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9198075401186235, + "compression/movement_sparsity/model_sparsity": 0.8882093214565809, + "compression_loss": 105.47940826416016, + "distillation_loss": 4.036114692687988, + "epoch": 4.57, + "learning_rate": 3.0191603268526347e-05, + "loss": 109.9489, + "step": 5401, + "task_loss": 1.870722770690918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9969520852488906, + "compression/movement_sparsity/importance_threshold": -2.1346786669490178e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.919891879756312, + "compression/movement_sparsity/model_sparsity": 0.8882907637682558, + "compression_loss": 105.48066711425781, + "distillation_loss": 6.202736854553223, + "epoch": 4.57, + "learning_rate": 3.0186907109984036e-05, + "loss": 109.6531, + "step": 5402, + "task_loss": 2.6868467330932617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9969698056389914, + "compression/movement_sparsity/importance_threshold": -2.122267775632458e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9199438333547014, + "compression/movement_sparsity/model_sparsity": 0.8883409326007126, + "compression_loss": 105.48190307617188, + "distillation_loss": 4.627185821533203, + "epoch": 4.57, + "learning_rate": 3.018221095144172e-05, + "loss": 109.6802, + "step": 5403, + "task_loss": 2.583956480026245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9969874572121299, + "compression/movement_sparsity/importance_threshold": -2.109905081891343e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9199580708108587, + "compression/movement_sparsity/model_sparsity": 0.8883546809564512, + "compression_loss": 105.48314666748047, + "distillation_loss": 3.526808500289917, + "epoch": 4.57, + "learning_rate": 3.017751479289941e-05, + "loss": 109.0925, + "step": 5404, + "task_loss": 1.9932243824005127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9970050401021913, + "compression/movement_sparsity/importance_threshold": -2.0975904919561493e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9199677055383084, + "compression/movement_sparsity/model_sparsity": 0.8883639847013732, + "compression_loss": 105.484375, + "distillation_loss": 4.282423973083496, + "epoch": 4.57, + "learning_rate": 3.01728186343571e-05, + "loss": 109.9214, + "step": 5405, + "task_loss": 3.657771348953247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9970225544430608, + "compression/movement_sparsity/importance_threshold": -2.0853239120571798e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9199280457567516, + "compression/movement_sparsity/model_sparsity": 0.8883256873553206, + "compression_loss": 105.48556518554688, + "distillation_loss": 4.78279972076416, + "epoch": 4.57, + "learning_rate": 3.0168122475814785e-05, + "loss": 109.618, + "step": 5406, + "task_loss": 2.6205201148986816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9970400003686233, + "compression/movement_sparsity/importance_threshold": -2.073105248424998e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9199251720323514, + "compression/movement_sparsity/model_sparsity": 0.8883229123521941, + "compression_loss": 105.48680877685547, + "distillation_loss": 3.8371617794036865, + "epoch": 4.57, + "learning_rate": 3.0163426317272475e-05, + "loss": 109.7411, + "step": 5407, + "task_loss": 2.1720006465911865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9970573780127642, + "compression/movement_sparsity/importance_threshold": -2.0609344072897336e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9199604794927211, + "compression/movement_sparsity/model_sparsity": 0.8883570068926817, + "compression_loss": 105.48800659179688, + "distillation_loss": 4.432809829711914, + "epoch": 4.57, + "learning_rate": 3.0158730158730158e-05, + "loss": 109.0317, + "step": 5408, + "task_loss": 2.487034320831299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9970746875093683, + "compression/movement_sparsity/importance_threshold": -2.0488112948821233e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9199695299359567, + "compression/movement_sparsity/model_sparsity": 0.8883657464253498, + "compression_loss": 105.4891357421875, + "distillation_loss": 3.049424409866333, + "epoch": 4.57, + "learning_rate": 3.0154034000187847e-05, + "loss": 109.3342, + "step": 5409, + "task_loss": 1.546929121017456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9970919289923212, + "compression/movement_sparsity/importance_threshold": -2.0367358174322968e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.919990969589366, + "compression/movement_sparsity/model_sparsity": 0.8883864495607083, + "compression_loss": 105.49037170410156, + "distillation_loss": 3.317962646484375, + "epoch": 4.57, + "learning_rate": 3.0149337841645537e-05, + "loss": 109.5388, + "step": 5410, + "task_loss": 0.9532434344291687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9971091025955078, + "compression/movement_sparsity/importance_threshold": -2.0247078811707307e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200005804684805, + "compression/movement_sparsity/model_sparsity": 0.8883957302765587, + "compression_loss": 105.49156951904297, + "distillation_loss": 4.547045707702637, + "epoch": 4.57, + "learning_rate": 3.0144641683103224e-05, + "loss": 109.2445, + "step": 5411, + "task_loss": 2.634608268737793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9971262084528132, + "compression/movement_sparsity/importance_threshold": -2.0127273923279017e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200108948734855, + "compression/movement_sparsity/model_sparsity": 0.888405690350021, + "compression_loss": 105.49274444580078, + "distillation_loss": 3.726978063583374, + "epoch": 4.57, + "learning_rate": 3.013994552456091e-05, + "loss": 108.882, + "step": 5412, + "task_loss": 2.5920374393463135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9971432466981225, + "compression/movement_sparsity/importance_threshold": -2.0007942571341128e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9199282842401043, + "compression/movement_sparsity/model_sparsity": 0.8883259176460365, + "compression_loss": 105.49391174316406, + "distillation_loss": 4.775929927825928, + "epoch": 4.58, + "learning_rate": 3.0135249366018596e-05, + "loss": 110.1901, + "step": 5413, + "task_loss": 2.1523897647857666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.997160217465321, + "compression/movement_sparsity/importance_threshold": -1.988908381819754e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9199744546171903, + "compression/movement_sparsity/model_sparsity": 0.888370501928633, + "compression_loss": 105.4951171875, + "distillation_loss": 5.30881404876709, + "epoch": 4.58, + "learning_rate": 3.0130553207476286e-05, + "loss": 109.8048, + "step": 5414, + "task_loss": 3.0027213096618652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9971771208882938, + "compression/movement_sparsity/importance_threshold": -1.9770696726153017e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9199871896282255, + "compression/movement_sparsity/model_sparsity": 0.8883827994528615, + "compression_loss": 105.49625396728516, + "distillation_loss": 3.2653913497924805, + "epoch": 4.58, + "learning_rate": 3.0125857048933976e-05, + "loss": 109.4935, + "step": 5415, + "task_loss": 1.4934067726135254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.997193957100926, + "compression/movement_sparsity/importance_threshold": -1.965278035751146e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200267182439383, + "compression/movement_sparsity/model_sparsity": 0.8884209701390204, + "compression_loss": 105.49735260009766, + "distillation_loss": 3.640929937362671, + "epoch": 4.58, + "learning_rate": 3.012116089039166e-05, + "loss": 109.3166, + "step": 5416, + "task_loss": 1.6433173418045044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9972107262371027, + "compression/movement_sparsity/importance_threshold": -1.953533377457503e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200069837465009, + "compression/movement_sparsity/model_sparsity": 0.8884019135822805, + "compression_loss": 105.49849700927734, + "distillation_loss": 4.611537933349609, + "epoch": 4.58, + "learning_rate": 3.011646473184935e-05, + "loss": 110.2838, + "step": 5417, + "task_loss": 3.0056958198547363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9972274284307091, + "compression/movement_sparsity/importance_threshold": -1.94183560396511e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200309036267785, + "compression/movement_sparsity/model_sparsity": 0.8884250117410842, + "compression_loss": 105.49968719482422, + "distillation_loss": 3.729404926300049, + "epoch": 4.58, + "learning_rate": 3.0111768573307038e-05, + "loss": 109.3573, + "step": 5418, + "task_loss": 2.040515661239624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9972440638156304, + "compression/movement_sparsity/importance_threshold": -1.9301846215040962e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200482652148563, + "compression/movement_sparsity/model_sparsity": 0.8884417769052011, + "compression_loss": 105.50084686279297, + "distillation_loss": 5.651251792907715, + "epoch": 4.58, + "learning_rate": 3.0107072414764724e-05, + "loss": 109.7857, + "step": 5419, + "task_loss": 4.400290012359619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9972606325257516, + "compression/movement_sparsity/importance_threshold": -1.9185803363048516e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200630631068925, + "compression/movement_sparsity/model_sparsity": 0.8884560664441221, + "compression_loss": 105.50201416015625, + "distillation_loss": 4.091880798339844, + "epoch": 4.58, + "learning_rate": 3.0102376256222407e-05, + "loss": 109.3682, + "step": 5420, + "task_loss": 2.8512072563171387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9972771346949579, + "compression/movement_sparsity/importance_threshold": -1.9070226545979396e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200387020324123, + "compression/movement_sparsity/model_sparsity": 0.8884325422474939, + "compression_loss": 105.503173828125, + "distillation_loss": 4.016436576843262, + "epoch": 4.58, + "learning_rate": 3.0097680097680097e-05, + "loss": 109.6879, + "step": 5421, + "task_loss": 2.579033374786377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9972935704571345, + "compression/movement_sparsity/importance_threshold": -1.8955114826136633e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200772528663791, + "compression/movement_sparsity/model_sparsity": 0.8884697687417177, + "compression_loss": 105.50439453125, + "distillation_loss": 6.383502960205078, + "epoch": 4.58, + "learning_rate": 3.0092983939137787e-05, + "loss": 109.6733, + "step": 5422, + "task_loss": 2.866666555404663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9973099399461665, + "compression/movement_sparsity/importance_threshold": -1.8840467265824994e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200537264836336, + "compression/movement_sparsity/model_sparsity": 0.888447050562595, + "compression_loss": 105.50558471679688, + "distillation_loss": 4.0242533683776855, + "epoch": 4.58, + "learning_rate": 3.0088287780595477e-05, + "loss": 110.1094, + "step": 5423, + "task_loss": 1.5807554721832275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.997326243295939, + "compression/movement_sparsity/importance_threshold": -1.872628292734664e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200336819578376, + "compression/movement_sparsity/model_sparsity": 0.8884276946279244, + "compression_loss": 105.50672912597656, + "distillation_loss": 6.1720781326293945, + "epoch": 4.58, + "learning_rate": 3.0083591622053163e-05, + "loss": 110.3509, + "step": 5424, + "task_loss": 3.150758743286133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9973424806403371, + "compression/movement_sparsity/importance_threshold": -1.8612560873008076e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9200354228863125, + "compression/movement_sparsity/model_sparsity": 0.8884293757501505, + "compression_loss": 105.50785064697266, + "distillation_loss": 3.480012893676758, + "epoch": 4.59, + "learning_rate": 3.007889546351085e-05, + "loss": 109.1464, + "step": 5425, + "task_loss": 1.575105905532837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9973586521132461, + "compression/movement_sparsity/importance_threshold": -1.8499300165111462e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.920060392093342, + "compression/movement_sparsity/model_sparsity": 0.8884534871881042, + "compression_loss": 105.50898742675781, + "distillation_loss": 4.788480758666992, + "epoch": 4.59, + "learning_rate": 3.0074199304968536e-05, + "loss": 109.9181, + "step": 5426, + "task_loss": 2.152116060256958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9973747578485511, + "compression/movement_sparsity/importance_threshold": -1.83864998659607e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9201151001744553, + "compression/movement_sparsity/model_sparsity": 0.8885063158783295, + "compression_loss": 105.51004791259766, + "distillation_loss": 5.405692100524902, + "epoch": 4.59, + "learning_rate": 3.0069503146426225e-05, + "loss": 110.1407, + "step": 5427, + "task_loss": 2.615222692489624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9973907979801372, + "compression/movement_sparsity/importance_threshold": -1.8274159037860553e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9201977942770101, + "compression/movement_sparsity/model_sparsity": 0.8885861691840646, + "compression_loss": 105.51116180419922, + "distillation_loss": 4.726553916931152, + "epoch": 4.59, + "learning_rate": 3.0064806987883915e-05, + "loss": 109.2017, + "step": 5428, + "task_loss": 2.1050519943237305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9974067726418895, + "compression/movement_sparsity/importance_threshold": -1.8162276743114922e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9203000440144876, + "compression/movement_sparsity/model_sparsity": 0.8886849063285028, + "compression_loss": 105.51224517822266, + "distillation_loss": 4.408090591430664, + "epoch": 4.59, + "learning_rate": 3.0060110829341598e-05, + "loss": 109.7134, + "step": 5429, + "task_loss": 2.938549757003784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9974226819676932, + "compression/movement_sparsity/importance_threshold": -1.8050852044026837e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202714260121615, + "compression/movement_sparsity/model_sparsity": 0.888657271442596, + "compression_loss": 105.5133056640625, + "distillation_loss": 3.5833630561828613, + "epoch": 4.59, + "learning_rate": 3.0055414670799288e-05, + "loss": 109.8828, + "step": 5430, + "task_loss": 2.7433745861053467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9974385260914334, + "compression/movement_sparsity/importance_threshold": -1.7939884002901932e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202721653105549, + "compression/movement_sparsity/model_sparsity": 0.8886579853438152, + "compression_loss": 105.51435089111328, + "distillation_loss": 5.136394500732422, + "epoch": 4.59, + "learning_rate": 3.0050718512256974e-05, + "loss": 109.936, + "step": 5431, + "task_loss": 1.9441114664077759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9974543051469952, + "compression/movement_sparsity/importance_threshold": -1.7829371682043238e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202688146194492, + "compression/movement_sparsity/model_sparsity": 0.888654749759257, + "compression_loss": 105.51541137695312, + "distillation_loss": 5.497203826904297, + "epoch": 4.59, + "learning_rate": 3.0046022353714664e-05, + "loss": 110.1104, + "step": 5432, + "task_loss": 3.15281343460083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9974700192682638, + "compression/movement_sparsity/importance_threshold": -1.7719314143754654e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202381456602898, + "compression/movement_sparsity/model_sparsity": 0.8886251343731935, + "compression_loss": 105.5165023803711, + "distillation_loss": 5.259067058563232, + "epoch": 4.59, + "learning_rate": 3.0041326195172347e-05, + "loss": 109.7755, + "step": 5433, + "task_loss": 3.562157154083252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9974856685891245, + "compression/movement_sparsity/importance_threshold": -1.760971045034008e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202509641404983, + "compression/movement_sparsity/model_sparsity": 0.8886375124991726, + "compression_loss": 105.51759338378906, + "distillation_loss": 4.971002578735352, + "epoch": 4.59, + "learning_rate": 3.0036630036630036e-05, + "loss": 110.0231, + "step": 5434, + "task_loss": 1.891106128692627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9975012532434622, + "compression/movement_sparsity/importance_threshold": -1.7500559664104277e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202753490633137, + "compression/movement_sparsity/model_sparsity": 0.8886610597248724, + "compression_loss": 105.51864624023438, + "distillation_loss": 5.3290557861328125, + "epoch": 4.59, + "learning_rate": 3.0031933878087726e-05, + "loss": 110.5705, + "step": 5435, + "task_loss": 3.0382227897644043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9975167733651621, + "compression/movement_sparsity/importance_threshold": -1.7391860847350282e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.920275205973302, + "compression/movement_sparsity/model_sparsity": 0.8886609215504429, + "compression_loss": 105.51971435546875, + "distillation_loss": 4.268392562866211, + "epoch": 4.59, + "learning_rate": 3.0027237719545416e-05, + "loss": 109.5617, + "step": 5436, + "task_loss": 2.192270040512085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9975322290881093, + "compression/movement_sparsity/importance_threshold": -1.728361306238286e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202429987965176, + "compression/movement_sparsity/model_sparsity": 0.8886298207892618, + "compression_loss": 105.52078247070312, + "distillation_loss": 4.538769721984863, + "epoch": 4.6, + "learning_rate": 3.0022541561003102e-05, + "loss": 109.4774, + "step": 5437, + "task_loss": 3.0774691104888916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9975476205461891, + "compression/movement_sparsity/importance_threshold": -1.717581537150504e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202198182146335, + "compression/movement_sparsity/model_sparsity": 0.8886074365316772, + "compression_loss": 105.52180480957031, + "distillation_loss": 4.1216230392456055, + "epoch": 4.6, + "learning_rate": 3.001784540246079e-05, + "loss": 109.2754, + "step": 5438, + "task_loss": 3.4140119552612305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9975629478732865, + "compression/movement_sparsity/importance_threshold": -1.706846683702159e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9201922256907241, + "compression/movement_sparsity/model_sparsity": 0.8885807918958486, + "compression_loss": 105.52288818359375, + "distillation_loss": 3.224083423614502, + "epoch": 4.6, + "learning_rate": 3.0013149243918475e-05, + "loss": 109.5837, + "step": 5439, + "task_loss": 1.856462836265564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9975782112032866, + "compression/movement_sparsity/importance_threshold": -1.6961566521237273e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202125086998727, + "compression/movement_sparsity/model_sparsity": 0.8886003781212352, + "compression_loss": 105.5239028930664, + "distillation_loss": 5.161131858825684, + "epoch": 4.6, + "learning_rate": 3.0008453085376165e-05, + "loss": 109.7338, + "step": 5440, + "task_loss": 3.1608545780181885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9975934106700748, + "compression/movement_sparsity/importance_threshold": -1.685511348645339e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202338291116057, + "compression/movement_sparsity/model_sparsity": 0.8886209661112358, + "compression_loss": 105.52497100830078, + "distillation_loss": 3.9445438385009766, + "epoch": 4.6, + "learning_rate": 3.0003756926833854e-05, + "loss": 109.463, + "step": 5441, + "task_loss": 3.3866138458251953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9976085464075359, + "compression/movement_sparsity/importance_threshold": -1.674910679497644e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202292025345629, + "compression/movement_sparsity/model_sparsity": 0.8886164984713475, + "compression_loss": 105.52599334716797, + "distillation_loss": 5.2165207862854, + "epoch": 4.6, + "learning_rate": 2.9999060768291537e-05, + "loss": 109.5505, + "step": 5442, + "task_loss": 2.4473001956939697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9976236185495553, + "compression/movement_sparsity/importance_threshold": -1.664354550911032e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.920282837440589, + "compression/movement_sparsity/model_sparsity": 0.8886682908533513, + "compression_loss": 105.52703094482422, + "distillation_loss": 4.258654594421387, + "epoch": 4.6, + "learning_rate": 2.9994364609749227e-05, + "loss": 109.1194, + "step": 5443, + "task_loss": 1.7345026731491089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9976386272300181, + "compression/movement_sparsity/importance_threshold": -1.6538428691156327e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202937719023111, + "compression/movement_sparsity/model_sparsity": 0.888678849682675, + "compression_loss": 105.52809143066406, + "distillation_loss": 4.340701580047607, + "epoch": 4.6, + "learning_rate": 2.9989668451206913e-05, + "loss": 109.8629, + "step": 5444, + "task_loss": 2.546527862548828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9976535725828093, + "compression/movement_sparsity/importance_threshold": -1.6433755403421832e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202164794476955, + "compression/movement_sparsity/model_sparsity": 0.8886042124616548, + "compression_loss": 105.52909088134766, + "distillation_loss": 3.998823881149292, + "epoch": 4.6, + "learning_rate": 2.9984972292664603e-05, + "loss": 109.7423, + "step": 5445, + "task_loss": 1.9490429162979126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9976684547418141, + "compression/movement_sparsity/importance_threshold": -1.6329524708208996e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202383960678102, + "compression/movement_sparsity/model_sparsity": 0.8886253761784451, + "compression_loss": 105.53009033203125, + "distillation_loss": 4.013250350952148, + "epoch": 4.6, + "learning_rate": 2.9980276134122286e-05, + "loss": 109.4206, + "step": 5446, + "task_loss": 3.065840244293213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9976832738409177, + "compression/movement_sparsity/importance_threshold": -1.622573566782172e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202834574973061, + "compression/movement_sparsity/model_sparsity": 0.8886688896092126, + "compression_loss": 105.53105163574219, + "distillation_loss": 3.6554760932922363, + "epoch": 4.6, + "learning_rate": 2.9975579975579976e-05, + "loss": 109.505, + "step": 5447, + "task_loss": 2.0333545207977295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9976980300140053, + "compression/movement_sparsity/importance_threshold": -1.6122387344564768e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9202830759239418, + "compression/movement_sparsity/model_sparsity": 0.8886685211440672, + "compression_loss": 105.53204345703125, + "distillation_loss": 3.490699052810669, + "epoch": 4.6, + "learning_rate": 2.9970883817037666e-05, + "loss": 109.3694, + "step": 5448, + "task_loss": 1.384548306465149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9977127233949618, + "compression/movement_sparsity/importance_threshold": -1.601947880074204e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9203423509612596, + "compression/movement_sparsity/model_sparsity": 0.8887257599015019, + "compression_loss": 105.53298950195312, + "distillation_loss": 4.419117450714111, + "epoch": 4.61, + "learning_rate": 2.9966187658495355e-05, + "loss": 109.4583, + "step": 5449, + "task_loss": 2.294048547744751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9977273541176725, + "compression/movement_sparsity/importance_threshold": -1.5917009098656565e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9203840855479851, + "compression/movement_sparsity/model_sparsity": 0.8887660607767828, + "compression_loss": 105.53385925292969, + "distillation_loss": 4.980733871459961, + "epoch": 4.61, + "learning_rate": 2.9961491499953038e-05, + "loss": 109.7196, + "step": 5450, + "task_loss": 2.9670205116271973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9977419223160227, + "compression/movement_sparsity/importance_threshold": -1.581497730061398e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9203977148715929, + "compression/movement_sparsity/model_sparsity": 0.888779221891196, + "compression_loss": 105.53483581542969, + "distillation_loss": 4.469579696655273, + "epoch": 4.61, + "learning_rate": 2.9956795341410725e-05, + "loss": 109.5081, + "step": 5451, + "task_loss": 2.549023151397705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9977564281238972, + "compression/movement_sparsity/importance_threshold": -1.5713382468916445e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.920440641875082, + "compression/movement_sparsity/model_sparsity": 0.8888206742200563, + "compression_loss": 105.53581237792969, + "distillation_loss": 4.998918533325195, + "epoch": 4.61, + "learning_rate": 2.9952099182868414e-05, + "loss": 110.3311, + "step": 5452, + "task_loss": 2.6924219131469727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9977708716751814, + "compression/movement_sparsity/importance_threshold": -1.561222366586873e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9204366592030917, + "compression/movement_sparsity/model_sparsity": 0.8888168283651009, + "compression_loss": 105.53672790527344, + "distillation_loss": 4.141615867614746, + "epoch": 4.61, + "learning_rate": 2.9947403024326104e-05, + "loss": 110.0063, + "step": 5453, + "task_loss": 2.9852094650268555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9977852531037603, + "compression/movement_sparsity/importance_threshold": -1.551149995377473e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9205023852151005, + "compression/movement_sparsity/model_sparsity": 0.8888802964864004, + "compression_loss": 105.53768157958984, + "distillation_loss": 4.295146942138672, + "epoch": 4.61, + "learning_rate": 2.9942706865783794e-05, + "loss": 109.9629, + "step": 5454, + "task_loss": 3.4808645248413086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9977995725435191, + "compression/movement_sparsity/importance_threshold": -1.5411210394939215e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9205119483975445, + "compression/movement_sparsity/model_sparsity": 0.8888895311441076, + "compression_loss": 105.53864288330078, + "distillation_loss": 5.221095085144043, + "epoch": 4.61, + "learning_rate": 2.9938010707241477e-05, + "loss": 110.4152, + "step": 5455, + "task_loss": 3.114377975463867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.997813830128343, + "compression/movement_sparsity/importance_threshold": -1.5311354051665213e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9205163007187316, + "compression/movement_sparsity/model_sparsity": 0.8888937339496726, + "compression_loss": 105.53961944580078, + "distillation_loss": 4.532803535461426, + "epoch": 4.61, + "learning_rate": 2.9933314548699166e-05, + "loss": 109.595, + "step": 5456, + "task_loss": 1.9667948484420776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9978280259921171, + "compression/movement_sparsity/importance_threshold": -1.5211929986256624e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9205378238413143, + "compression/movement_sparsity/model_sparsity": 0.8889145176867818, + "compression_loss": 105.54054260253906, + "distillation_loss": 3.1912975311279297, + "epoch": 4.61, + "learning_rate": 2.9928618390156853e-05, + "loss": 109.2794, + "step": 5457, + "task_loss": 2.0149428844451904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9978421602687264, + "compression/movement_sparsity/importance_threshold": -1.5112937261018214e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9205403279165179, + "compression/movement_sparsity/model_sparsity": 0.8889169357392986, + "compression_loss": 105.5415267944336, + "distillation_loss": 3.9845850467681885, + "epoch": 4.61, + "learning_rate": 2.9923922231614543e-05, + "loss": 109.2521, + "step": 5458, + "task_loss": 1.4609997272491455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9978562330920562, + "compression/movement_sparsity/importance_threshold": -1.5014374938253014e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9205084665405948, + "compression/movement_sparsity/model_sparsity": 0.8888861688996557, + "compression_loss": 105.54244232177734, + "distillation_loss": 6.106750011444092, + "epoch": 4.61, + "learning_rate": 2.9919226073072225e-05, + "loss": 110.6262, + "step": 5459, + "task_loss": 4.572825908660889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9978702445959916, + "compression/movement_sparsity/importance_threshold": -1.491624208026579e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9205084903889301, + "compression/movement_sparsity/model_sparsity": 0.8888861919287272, + "compression_loss": 105.54338073730469, + "distillation_loss": 4.5230889320373535, + "epoch": 4.61, + "learning_rate": 2.9914529914529915e-05, + "loss": 109.3057, + "step": 5460, + "task_loss": 1.55849027633667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9978841949144177, + "compression/movement_sparsity/importance_threshold": -1.4818537749360441e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9204459362055124, + "compression/movement_sparsity/model_sparsity": 0.8888257866739491, + "compression_loss": 105.54432678222656, + "distillation_loss": 2.8002400398254395, + "epoch": 4.62, + "learning_rate": 2.9909833755987605e-05, + "loss": 108.3748, + "step": 5461, + "task_loss": 1.3882176876068115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9978980841812197, + "compression/movement_sparsity/importance_threshold": -1.4721261007840866e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9204676858872802, + "compression/movement_sparsity/model_sparsity": 0.8888467891872384, + "compression_loss": 105.54521179199219, + "distillation_loss": 5.3518571853637695, + "epoch": 4.62, + "learning_rate": 2.9905137597445295e-05, + "loss": 110.0048, + "step": 5462, + "task_loss": 3.5659778118133545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9979119125302828, + "compression/movement_sparsity/importance_threshold": -1.4624410918010096e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9205679085162596, + "compression/movement_sparsity/model_sparsity": 0.8889435688605914, + "compression_loss": 105.54608154296875, + "distillation_loss": 4.561398029327393, + "epoch": 4.62, + "learning_rate": 2.9900441438902978e-05, + "loss": 109.0262, + "step": 5463, + "task_loss": 2.1509876251220703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.997925680095492, + "compression/movement_sparsity/importance_threshold": -1.4527986542173764e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9206102512355345, + "compression/movement_sparsity/model_sparsity": 0.8889844569771979, + "compression_loss": 105.54691314697266, + "distillation_loss": 4.082301616668701, + "epoch": 4.62, + "learning_rate": 2.9895745280360664e-05, + "loss": 109.388, + "step": 5464, + "task_loss": 2.8342440128326416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9979393870107324, + "compression/movement_sparsity/importance_threshold": -1.4431986942634902e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9205886923404489, + "compression/movement_sparsity/model_sparsity": 0.8889636386964813, + "compression_loss": 105.5477294921875, + "distillation_loss": 4.635745525360107, + "epoch": 4.62, + "learning_rate": 2.9891049121818354e-05, + "loss": 109.8422, + "step": 5465, + "task_loss": 1.616660714149475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9979530334098894, + "compression/movement_sparsity/importance_threshold": -1.4336411181697407e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9206219846164883, + "compression/movement_sparsity/model_sparsity": 0.8889957872804197, + "compression_loss": 105.54859161376953, + "distillation_loss": 3.7916207313537598, + "epoch": 4.62, + "learning_rate": 2.9886352963276043e-05, + "loss": 110.2712, + "step": 5466, + "task_loss": 1.5971463918685913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9979666194268478, + "compression/movement_sparsity/importance_threshold": -1.4241258321666048e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9206097742688292, + "compression/movement_sparsity/model_sparsity": 0.8889839963957661, + "compression_loss": 105.54937744140625, + "distillation_loss": 5.486489772796631, + "epoch": 4.62, + "learning_rate": 2.9881656804733733e-05, + "loss": 110.2393, + "step": 5467, + "task_loss": 3.099483013153076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.997980145195493, + "compression/movement_sparsity/importance_threshold": -1.4146527424843854e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9206345884616793, + "compression/movement_sparsity/model_sparsity": 0.8890079581447545, + "compression_loss": 105.55021667480469, + "distillation_loss": 4.403571128845215, + "epoch": 4.62, + "learning_rate": 2.9876960646191416e-05, + "loss": 110.0653, + "step": 5468, + "task_loss": 2.2342982292175293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9979936108497102, + "compression/movement_sparsity/importance_threshold": -1.4052217553534724e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9206632303123407, + "compression/movement_sparsity/model_sparsity": 0.889035616059733, + "compression_loss": 105.55101013183594, + "distillation_loss": 5.160192489624023, + "epoch": 4.62, + "learning_rate": 2.9872264487649106e-05, + "loss": 109.8276, + "step": 5469, + "task_loss": 2.0440902709960938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9980070165233843, + "compression/movement_sparsity/importance_threshold": -1.3958327770042557e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9206772889059833, + "compression/movement_sparsity/model_sparsity": 0.8890491916974348, + "compression_loss": 105.55180358886719, + "distillation_loss": 3.341071605682373, + "epoch": 4.62, + "learning_rate": 2.9867568329106792e-05, + "loss": 109.2989, + "step": 5470, + "task_loss": 2.2177059650421143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9980203623504005, + "compression/movement_sparsity/importance_threshold": -1.3864857136673854e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9207051199132454, + "compression/movement_sparsity/model_sparsity": 0.8890760666239792, + "compression_loss": 105.55258178710938, + "distillation_loss": 4.070933818817139, + "epoch": 4.62, + "learning_rate": 2.9862872170564482e-05, + "loss": 109.2709, + "step": 5471, + "task_loss": 1.638122320175171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.998033648464644, + "compression/movement_sparsity/importance_threshold": -1.3771804715729044e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.920764538040575, + "compression/movement_sparsity/model_sparsity": 0.8891334435558435, + "compression_loss": 105.5533676147461, + "distillation_loss": 4.104494094848633, + "epoch": 4.63, + "learning_rate": 2.9858176012022165e-05, + "loss": 109.1266, + "step": 5472, + "task_loss": 2.204360246658325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.998046875, + "compression/movement_sparsity/importance_threshold": -1.367916956951376e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9207605434444169, + "compression/movement_sparsity/model_sparsity": 0.8891295861863522, + "compression_loss": 105.55414581298828, + "distillation_loss": 4.091855525970459, + "epoch": 4.63, + "learning_rate": 2.9853479853479855e-05, + "loss": 109.6253, + "step": 5473, + "task_loss": 3.016900062561035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9980600420903535, + "compression/movement_sparsity/importance_threshold": -1.358695076033277e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9207917132186171, + "compression/movement_sparsity/model_sparsity": 0.8891596851829192, + "compression_loss": 105.554931640625, + "distillation_loss": 3.9890122413635254, + "epoch": 4.63, + "learning_rate": 2.9848783694937544e-05, + "loss": 110.0982, + "step": 5474, + "task_loss": 2.9276719093322754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9980731498695897, + "compression/movement_sparsity/importance_threshold": -1.3495147350488235e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9207610084869547, + "compression/movement_sparsity/model_sparsity": 0.8891300352532482, + "compression_loss": 105.55571746826172, + "distillation_loss": 3.0866899490356445, + "epoch": 4.63, + "learning_rate": 2.984408753639523e-05, + "loss": 108.9469, + "step": 5475, + "task_loss": 1.4193649291992188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9980861984715937, + "compression/movement_sparsity/importance_threshold": -1.340375840228579e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9207965067340067, + "compression/movement_sparsity/model_sparsity": 0.8891643140263086, + "compression_loss": 105.55652618408203, + "distillation_loss": 4.663867950439453, + "epoch": 4.63, + "learning_rate": 2.9839391377852917e-05, + "loss": 109.8642, + "step": 5476, + "task_loss": 2.5201027393341064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9980991880302509, + "compression/movement_sparsity/importance_threshold": -1.3312782978027599e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9208260428972408, + "compression/movement_sparsity/model_sparsity": 0.8891928355314717, + "compression_loss": 105.5572509765625, + "distillation_loss": 3.7670371532440186, + "epoch": 4.63, + "learning_rate": 2.9834695219310603e-05, + "loss": 109.7349, + "step": 5477, + "task_loss": 2.95670485496521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.998112118679446, + "compression/movement_sparsity/importance_threshold": -1.3222220140019295e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.920875480496259, + "compression/movement_sparsity/model_sparsity": 0.8892405747968758, + "compression_loss": 105.55802154541016, + "distillation_loss": 3.4481446743011475, + "epoch": 4.63, + "learning_rate": 2.9829999060768293e-05, + "loss": 109.1789, + "step": 5478, + "task_loss": 2.270829916000366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9981249905530645, + "compression/movement_sparsity/importance_threshold": -1.3132068950564776e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9208802501633134, + "compression/movement_sparsity/model_sparsity": 0.8892451806111936, + "compression_loss": 105.55878448486328, + "distillation_loss": 3.694906234741211, + "epoch": 4.63, + "learning_rate": 2.9825302902225983e-05, + "loss": 109.2548, + "step": 5479, + "task_loss": 2.381474018096924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9981378037849913, + "compression/movement_sparsity/importance_threshold": -1.3042328471967075e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9209064356354417, + "compression/movement_sparsity/model_sparsity": 0.8892704665317984, + "compression_loss": 105.55953216552734, + "distillation_loss": 5.149961948394775, + "epoch": 4.63, + "learning_rate": 2.9820606743683666e-05, + "loss": 109.2785, + "step": 5480, + "task_loss": 2.5064005851745605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9981505585091117, + "compression/movement_sparsity/importance_threshold": -1.2952997766530958e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9209269094312725, + "compression/movement_sparsity/model_sparsity": 0.8892902369897576, + "compression_loss": 105.56021881103516, + "distillation_loss": 4.603871822357178, + "epoch": 4.63, + "learning_rate": 2.9815910585141355e-05, + "loss": 109.3411, + "step": 5481, + "task_loss": 2.3995096683502197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9981632548593108, + "compression/movement_sparsity/importance_threshold": -1.2864075896559454e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9209023456459426, + "compression/movement_sparsity/model_sparsity": 0.8892665170460209, + "compression_loss": 105.56086730957031, + "distillation_loss": 2.985311508178711, + "epoch": 4.63, + "learning_rate": 2.9811214426599042e-05, + "loss": 108.9121, + "step": 5482, + "task_loss": 1.6399996280670166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9981758929694737, + "compression/movement_sparsity/importance_threshold": -1.2775561924357332e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9209357929361612, + "compression/movement_sparsity/model_sparsity": 0.8892988153189246, + "compression_loss": 105.56161499023438, + "distillation_loss": 3.9357495307922363, + "epoch": 4.63, + "learning_rate": 2.980651826805673e-05, + "loss": 109.5999, + "step": 5483, + "task_loss": 2.5850868225097656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9981884729734857, + "compression/movement_sparsity/importance_threshold": -1.2687454912228488e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9208624116085301, + "compression/movement_sparsity/model_sparsity": 0.889227954865645, + "compression_loss": 105.56236267089844, + "distillation_loss": 4.823022365570068, + "epoch": 4.64, + "learning_rate": 2.980182210951442e-05, + "loss": 110.5117, + "step": 5484, + "task_loss": 2.3073902130126953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9982009950052316, + "compression/movement_sparsity/importance_threshold": -1.259975392247769e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9208684213890186, + "compression/movement_sparsity/model_sparsity": 0.8892337581916855, + "compression_loss": 105.56307983398438, + "distillation_loss": 5.302607536315918, + "epoch": 4.64, + "learning_rate": 2.9797125950972104e-05, + "loss": 110.5084, + "step": 5485, + "task_loss": 2.736426830291748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9982134591985969, + "compression/movement_sparsity/importance_threshold": -1.25124580174071e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9208960019887604, + "compression/movement_sparsity/model_sparsity": 0.8892603913129782, + "compression_loss": 105.56381225585938, + "distillation_loss": 4.095168590545654, + "epoch": 4.64, + "learning_rate": 2.9792429792429794e-05, + "loss": 109.5123, + "step": 5486, + "task_loss": 2.788638114929199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9982258656874666, + "compression/movement_sparsity/importance_threshold": -1.2425566259321487e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9209203272907376, + "compression/movement_sparsity/model_sparsity": 0.8892838809659991, + "compression_loss": 105.56459045410156, + "distillation_loss": 6.452739238739014, + "epoch": 4.64, + "learning_rate": 2.9787733633887484e-05, + "loss": 110.244, + "step": 5487, + "task_loss": 3.432340383529663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9982382146057258, + "compression/movement_sparsity/importance_threshold": -1.2339077710524747e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9209508889323883, + "compression/movement_sparsity/model_sparsity": 0.8893133927212405, + "compression_loss": 105.56527709960938, + "distillation_loss": 2.6238038539886475, + "epoch": 4.64, + "learning_rate": 2.978303747534517e-05, + "loss": 109.0877, + "step": 5488, + "task_loss": 1.5291496515274048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9982505060872596, + "compression/movement_sparsity/importance_threshold": -1.2252991433322515e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9210213846114514, + "compression/movement_sparsity/model_sparsity": 0.8893814666568578, + "compression_loss": 105.5660629272461, + "distillation_loss": 4.410998344421387, + "epoch": 4.64, + "learning_rate": 2.9778341316802856e-05, + "loss": 109.4212, + "step": 5489, + "task_loss": 2.4471828937530518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9982627402659533, + "compression/movement_sparsity/importance_threshold": -1.2167306490016086e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9210750195174776, + "compression/movement_sparsity/model_sparsity": 0.8894332590388616, + "compression_loss": 105.56681823730469, + "distillation_loss": 4.909655570983887, + "epoch": 4.64, + "learning_rate": 2.9773645158260543e-05, + "loss": 109.1968, + "step": 5490, + "task_loss": 3.52805495262146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.998274917275692, + "compression/movement_sparsity/importance_threshold": -1.2082021942911095e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9210568113134976, + "compression/movement_sparsity/model_sparsity": 0.8894156763427034, + "compression_loss": 105.56758117675781, + "distillation_loss": 2.9336447715759277, + "epoch": 4.64, + "learning_rate": 2.9768948999718232e-05, + "loss": 109.1259, + "step": 5491, + "task_loss": 1.8460140228271484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9982870372503607, + "compression/movement_sparsity/importance_threshold": -1.199713685431144e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9210429077340342, + "compression/movement_sparsity/model_sparsity": 0.8894022503939669, + "compression_loss": 105.5682601928711, + "distillation_loss": 2.950896739959717, + "epoch": 4.64, + "learning_rate": 2.9764252841175922e-05, + "loss": 109.2713, + "step": 5492, + "task_loss": 1.5536385774612427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9982991003238446, + "compression/movement_sparsity/importance_threshold": -1.1912650286521019e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9210244133500309, + "compression/movement_sparsity/model_sparsity": 0.8893843913489496, + "compression_loss": 105.5689926147461, + "distillation_loss": 5.545666694641113, + "epoch": 4.64, + "learning_rate": 2.9759556682633605e-05, + "loss": 110.0599, + "step": 5493, + "task_loss": 2.71593976020813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.998311106630029, + "compression/movement_sparsity/importance_threshold": -1.1828561301842865e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9210034506633271, + "compression/movement_sparsity/model_sparsity": 0.8893641487950228, + "compression_loss": 105.5696792602539, + "distillation_loss": 4.216726303100586, + "epoch": 4.64, + "learning_rate": 2.9754860524091295e-05, + "loss": 109.9911, + "step": 5494, + "task_loss": 3.4572525024414062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9983230563027988, + "compression/movement_sparsity/importance_threshold": -1.1744868962581742e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9210672330360113, + "compression/movement_sparsity/model_sparsity": 0.8894257400469878, + "compression_loss": 105.57041931152344, + "distillation_loss": 5.391574382781982, + "epoch": 4.64, + "learning_rate": 2.975016436554898e-05, + "loss": 109.9368, + "step": 5495, + "task_loss": 2.361868143081665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9983349494760394, + "compression/movement_sparsity/importance_threshold": -1.166157233104155e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9210615213597138, + "compression/movement_sparsity/model_sparsity": 0.8894202245843422, + "compression_loss": 105.5711669921875, + "distillation_loss": 4.1287946701049805, + "epoch": 4.65, + "learning_rate": 2.974546820700667e-05, + "loss": 110.1577, + "step": 5496, + "task_loss": 2.693920850753784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9983467862836357, + "compression/movement_sparsity/importance_threshold": -1.1578670469526188e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211191747102332, + "compression/movement_sparsity/model_sparsity": 0.8894758973649088, + "compression_loss": 105.57190704345703, + "distillation_loss": 4.5669169425964355, + "epoch": 4.65, + "learning_rate": 2.9740772048464354e-05, + "loss": 109.6205, + "step": 5497, + "task_loss": 3.3176538944244385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.998358566859473, + "compression/movement_sparsity/importance_threshold": -1.1496162440339554e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211239920739581, + "compression/movement_sparsity/model_sparsity": 0.8894805492373699, + "compression_loss": 105.57260131835938, + "distillation_loss": 4.57017707824707, + "epoch": 4.65, + "learning_rate": 2.9736075889922044e-05, + "loss": 109.625, + "step": 5498, + "task_loss": 2.720797538757324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9983702913374364, + "compression/movement_sparsity/importance_threshold": -1.1414047305786414e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.921157952103385, + "compression/movement_sparsity/model_sparsity": 0.8895133426353127, + "compression_loss": 105.57333374023438, + "distillation_loss": 4.807487964630127, + "epoch": 4.65, + "learning_rate": 2.9731379731379733e-05, + "loss": 110.057, + "step": 5499, + "task_loss": 2.0919086933135986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9983819598514109, + "compression/movement_sparsity/importance_threshold": -1.1332324128169799e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211280105184514, + "compression/movement_sparsity/model_sparsity": 0.8894844296359325, + "compression_loss": 105.573974609375, + "distillation_loss": 4.656310558319092, + "epoch": 4.65, + "learning_rate": 2.9726683572837423e-05, + "loss": 109.6831, + "step": 5500, + "task_loss": 3.0060484409332275 + }, + { + "epoch": 4.65, + "eval_accuracy": 0.5844356435643564, + "eval_loss": 109.220458984375, + "eval_runtime": 227.6601, + "eval_samples_per_second": 110.911, + "eval_steps_per_second": 0.87, + "step": 5500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9983935725352818, + "compression/movement_sparsity/importance_threshold": -1.1250991969794476e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211420691120941, + "compression/movement_sparsity/model_sparsity": 0.8894980052736343, + "compression_loss": 105.57464599609375, + "distillation_loss": 5.109872817993164, + "epoch": 4.65, + "learning_rate": 2.972198741429511e-05, + "loss": 109.9574, + "step": 5501, + "task_loss": 3.02851939201355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9984051295229343, + "compression/movement_sparsity/importance_threshold": -1.1170049892963475e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211321124321181, + "compression/movement_sparsity/model_sparsity": 0.889488390636246, + "compression_loss": 105.57530212402344, + "distillation_loss": 5.3459954261779785, + "epoch": 4.65, + "learning_rate": 2.9717291255752792e-05, + "loss": 110.1277, + "step": 5502, + "task_loss": 3.3991212844848633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9984166309482533, + "compression/movement_sparsity/importance_threshold": -1.1089496959981564e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211259834099532, + "compression/movement_sparsity/model_sparsity": 0.8894824721648475, + "compression_loss": 105.57597351074219, + "distillation_loss": 4.142326354980469, + "epoch": 4.65, + "learning_rate": 2.9712595097210482e-05, + "loss": 109.6358, + "step": 5503, + "task_loss": 2.7353999614715576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9984280769451241, + "compression/movement_sparsity/importance_threshold": -1.1009332233151772e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211947500847093, + "compression/movement_sparsity/model_sparsity": 0.8895488764927747, + "compression_loss": 105.57666778564453, + "distillation_loss": 4.549707889556885, + "epoch": 4.65, + "learning_rate": 2.9707898938668172e-05, + "loss": 109.9483, + "step": 5504, + "task_loss": 2.590615749359131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.998439467647432, + "compression/movement_sparsity/importance_threshold": -1.0929554774778867e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9212436272478487, + "compression/movement_sparsity/model_sparsity": 0.8895960745749965, + "compression_loss": 105.57730865478516, + "distillation_loss": 3.782606601715088, + "epoch": 4.65, + "learning_rate": 2.970320278012586e-05, + "loss": 109.6767, + "step": 5505, + "task_loss": 2.520418405532837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9984508031890618, + "compression/movement_sparsity/importance_threshold": -1.0850163647167614e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9212841336453077, + "compression/movement_sparsity/model_sparsity": 0.8896351894530905, + "compression_loss": 105.5779037475586, + "distillation_loss": 3.771148681640625, + "epoch": 4.65, + "learning_rate": 2.9698506621583544e-05, + "loss": 109.2835, + "step": 5506, + "task_loss": 2.1486878395080566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9984620837038989, + "compression/movement_sparsity/importance_threshold": -1.0771157912620177e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9212520457101996, + "compression/movement_sparsity/model_sparsity": 0.8896042038372675, + "compression_loss": 105.57848358154297, + "distillation_loss": 3.160004138946533, + "epoch": 4.65, + "learning_rate": 2.9693810463041234e-05, + "loss": 109.9779, + "step": 5507, + "task_loss": 2.5075628757476807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9984733093258282, + "compression/movement_sparsity/importance_threshold": -1.069253663344219e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9212581151115262, + "compression/movement_sparsity/model_sparsity": 0.8896100647359868, + "compression_loss": 105.5790786743164, + "distillation_loss": 4.342315673828125, + "epoch": 4.66, + "learning_rate": 2.968911430449892e-05, + "loss": 109.3495, + "step": 5508, + "task_loss": 2.3489034175872803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9984844801887351, + "compression/movement_sparsity/importance_threshold": -1.0614298871935816e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211778177666664, + "compression/movement_sparsity/model_sparsity": 0.8895325258519464, + "compression_loss": 105.57966613769531, + "distillation_loss": 3.7558228969573975, + "epoch": 4.66, + "learning_rate": 2.968441814595661e-05, + "loss": 109.2396, + "step": 5509, + "task_loss": 2.336001396179199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9984955964265047, + "compression/movement_sparsity/importance_threshold": -1.053644369040669e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9212110861943704, + "compression/movement_sparsity/model_sparsity": 0.8895646514068132, + "compression_loss": 105.5802001953125, + "distillation_loss": 4.306859970092773, + "epoch": 4.66, + "learning_rate": 2.9679721987414293e-05, + "loss": 109.8347, + "step": 5510, + "task_loss": 2.8960933685302734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9985066581730219, + "compression/movement_sparsity/importance_threshold": -1.0458970151158709e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211811923061073, + "compression/movement_sparsity/model_sparsity": 0.8895357844655762, + "compression_loss": 105.58077239990234, + "distillation_loss": 4.569491386413574, + "epoch": 4.66, + "learning_rate": 2.9675025828871983e-05, + "loss": 109.4031, + "step": 5511, + "task_loss": 2.2558350563049316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9985176655621721, + "compression/movement_sparsity/importance_threshold": -1.0381877316494037e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211969679798896, + "compression/movement_sparsity/model_sparsity": 0.8895510181964325, + "compression_loss": 105.58133697509766, + "distillation_loss": 3.9297678470611572, + "epoch": 4.66, + "learning_rate": 2.9670329670329673e-05, + "loss": 109.8766, + "step": 5512, + "task_loss": 1.3193987607955933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9985286187278404, + "compression/movement_sparsity/importance_threshold": -1.0305164248718308e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9211955847764438, + "compression/movement_sparsity/model_sparsity": 0.8895496825102802, + "compression_loss": 105.58186340332031, + "distillation_loss": 5.221585273742676, + "epoch": 4.66, + "learning_rate": 2.9665633511787362e-05, + "loss": 109.6087, + "step": 5513, + "task_loss": 3.1311895847320557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9985395178039117, + "compression/movement_sparsity/importance_threshold": -1.0228830010136289e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9212410754759747, + "compression/movement_sparsity/model_sparsity": 0.8895936104643364, + "compression_loss": 105.5824203491211, + "distillation_loss": 4.5852508544921875, + "epoch": 4.66, + "learning_rate": 2.966093735324505e-05, + "loss": 110.1361, + "step": 5514, + "task_loss": 1.674310564994812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9985503629242715, + "compression/movement_sparsity/importance_threshold": -1.0152873663050142e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9212778853814665, + "compression/movement_sparsity/model_sparsity": 0.8896291558363342, + "compression_loss": 105.58287048339844, + "distillation_loss": 4.056064128875732, + "epoch": 4.66, + "learning_rate": 2.965624119470273e-05, + "loss": 109.6557, + "step": 5515, + "task_loss": 2.0885801315307617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9985611542228048, + "compression/movement_sparsity/importance_threshold": -1.0077294269763767e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9212736880744588, + "compression/movement_sparsity/model_sparsity": 0.8896251027197345, + "compression_loss": 105.58338928222656, + "distillation_loss": 3.532503366470337, + "epoch": 4.66, + "learning_rate": 2.965154503616042e-05, + "loss": 109.4044, + "step": 5516, + "task_loss": 2.4017531871795654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9985718918333966, + "compression/movement_sparsity/importance_threshold": -1.0002090892582798e-05, + "compression/movement_sparsity/linear_layer_sparsity": 0.9213254747345012, + "compression/movement_sparsity/model_sparsity": 0.8896751103486902, + "compression_loss": 105.58386993408203, + "distillation_loss": 3.244553327560425, + "epoch": 4.66, + "learning_rate": 2.964684887761811e-05, + "loss": 108.9161, + "step": 5517, + "task_loss": 2.21809458732605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9985825758899322, + "compression/movement_sparsity/importance_threshold": -9.927262593810264e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9213226129342686, + "compression/movement_sparsity/model_sparsity": 0.8896723468600995, + "compression_loss": 105.58434295654297, + "distillation_loss": 4.375256061553955, + "epoch": 4.66, + "learning_rate": 2.96421527190758e-05, + "loss": 109.0316, + "step": 5518, + "task_loss": 3.2320735454559326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9985932065262967, + "compression/movement_sparsity/importance_threshold": -9.852808435749198e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.921392285845765, + "compression/movement_sparsity/model_sparsity": 0.889739626292747, + "compression_loss": 105.58478546142578, + "distillation_loss": 3.3518385887145996, + "epoch": 4.66, + "learning_rate": 2.9637456560533484e-05, + "loss": 109.2038, + "step": 5519, + "task_loss": 2.1962645053863525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9986037838763752, + "compression/movement_sparsity/importance_threshold": -9.778727480706101e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9214746579957935, + "compression/movement_sparsity/model_sparsity": 0.8898191687060157, + "compression_loss": 105.58521270751953, + "distillation_loss": 3.33717679977417, + "epoch": 4.67, + "learning_rate": 2.9632760401991173e-05, + "loss": 109.577, + "step": 5520, + "task_loss": 1.2198176383972168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9986143080740528, + "compression/movement_sparsity/importance_threshold": -9.705018790982269e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9214571294693689, + "compression/movement_sparsity/model_sparsity": 0.8898022423383978, + "compression_loss": 105.58563232421875, + "distillation_loss": 5.187948226928711, + "epoch": 4.67, + "learning_rate": 2.962806424344886e-05, + "loss": 109.8002, + "step": 5521, + "task_loss": 1.6369948387145996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9986247792532149, + "compression/movement_sparsity/importance_threshold": -9.631681428882467e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9214765897109506, + "compression/movement_sparsity/model_sparsity": 0.8898210340608144, + "compression_loss": 105.58607482910156, + "distillation_loss": 4.363204002380371, + "epoch": 4.67, + "learning_rate": 2.962336808490655e-05, + "loss": 109.8458, + "step": 5522, + "task_loss": 2.6628644466400146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9986351975477463, + "compression/movement_sparsity/importance_threshold": -9.55871445671233e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9215207210553709, + "compression/movement_sparsity/model_sparsity": 0.88986364935779, + "compression_loss": 105.58650970458984, + "distillation_loss": 4.051202774047852, + "epoch": 4.67, + "learning_rate": 2.9618671926364233e-05, + "loss": 109.2223, + "step": 5523, + "task_loss": 2.1149940490722656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9986455630915324, + "compression/movement_sparsity/importance_threshold": -9.486116936774022e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9214801192645707, + "compression/movement_sparsity/model_sparsity": 0.8898244423634096, + "compression_loss": 105.58686828613281, + "distillation_loss": 4.639266490936279, + "epoch": 4.67, + "learning_rate": 2.9613975767821922e-05, + "loss": 110.0244, + "step": 5524, + "task_loss": 2.3971798419952393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9986558760184582, + "compression/movement_sparsity/importance_threshold": -9.413887931371441e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9215139242798185, + "compression/movement_sparsity/model_sparsity": 0.8898570860723871, + "compression_loss": 105.58724975585938, + "distillation_loss": 4.142387390136719, + "epoch": 4.67, + "learning_rate": 2.9609279609279612e-05, + "loss": 109.369, + "step": 5525, + "task_loss": 3.0739452838897705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9986661364624089, + "compression/movement_sparsity/importance_threshold": -9.34202650281022e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216057045981116, + "compression/movement_sparsity/model_sparsity": 0.8899457134543977, + "compression_loss": 105.587646484375, + "distillation_loss": 3.021927833557129, + "epoch": 4.67, + "learning_rate": 2.9604583450737298e-05, + "loss": 109.323, + "step": 5526, + "task_loss": 1.3770098686218262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9986763445572695, + "compression/movement_sparsity/importance_threshold": -9.270531713392524e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216087214125236, + "compression/movement_sparsity/model_sparsity": 0.8899486266319537, + "compression_loss": 105.58799743652344, + "distillation_loss": 4.188009738922119, + "epoch": 4.67, + "learning_rate": 2.9599887292194985e-05, + "loss": 110.2321, + "step": 5527, + "task_loss": 2.551593780517578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9986865004369254, + "compression/movement_sparsity/importance_threshold": -9.199402625423118e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9215627060496168, + "compression/movement_sparsity/model_sparsity": 0.8899041920383226, + "compression_loss": 105.58845520019531, + "distillation_loss": 3.984070301055908, + "epoch": 4.67, + "learning_rate": 2.959519113365267e-05, + "loss": 109.0129, + "step": 5528, + "task_loss": 3.040581703186035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9986966042352615, + "compression/movement_sparsity/importance_threshold": -9.12863830120677e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216025327695205, + "compression/movement_sparsity/model_sparsity": 0.8899426505878764, + "compression_loss": 105.58878326416016, + "distillation_loss": 3.0336554050445557, + "epoch": 4.67, + "learning_rate": 2.959049497511036e-05, + "loss": 108.6966, + "step": 5529, + "task_loss": 1.4262202978134155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9987066560861632, + "compression/movement_sparsity/importance_threshold": -9.05823780304564e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9215813912203021, + "compression/movement_sparsity/model_sparsity": 0.8899222353159126, + "compression_loss": 105.58914184570312, + "distillation_loss": 3.816352128982544, + "epoch": 4.67, + "learning_rate": 2.958579881656805e-05, + "loss": 109.781, + "step": 5530, + "task_loss": 1.7347720861434937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9987166561235153, + "compression/movement_sparsity/importance_threshold": -8.988200193244499e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9215621694620731, + "compression/movement_sparsity/model_sparsity": 0.8899036738842119, + "compression_loss": 105.58946990966797, + "distillation_loss": 4.625051021575928, + "epoch": 4.67, + "learning_rate": 2.958110265802574e-05, + "loss": 109.7069, + "step": 5531, + "task_loss": 2.8517322540283203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9987266044812032, + "compression/movement_sparsity/importance_threshold": -8.918524534108109e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216012449594159, + "compression/movement_sparsity/model_sparsity": 0.8899414070180105, + "compression_loss": 105.5898208618164, + "distillation_loss": 4.2481160163879395, + "epoch": 4.68, + "learning_rate": 2.9576406499483423e-05, + "loss": 109.6789, + "step": 5532, + "task_loss": 2.5806219577789307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.998736501293112, + "compression/movement_sparsity/importance_threshold": -8.849209887939503e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216336071503796, + "compression/movement_sparsity/model_sparsity": 0.8899726574681569, + "compression_loss": 105.5901870727539, + "distillation_loss": 4.160010814666748, + "epoch": 4.68, + "learning_rate": 2.957171034094111e-05, + "loss": 109.6953, + "step": 5533, + "task_loss": 2.2042551040649414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9987463466931267, + "compression/movement_sparsity/importance_threshold": -8.780255317044315e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216428006836268, + "compression/movement_sparsity/model_sparsity": 0.8899815351752545, + "compression_loss": 105.59048461914062, + "distillation_loss": 3.8082284927368164, + "epoch": 4.68, + "learning_rate": 2.95670141823988e-05, + "loss": 108.9387, + "step": 5534, + "task_loss": 2.353160858154297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9987561408151326, + "compression/movement_sparsity/importance_threshold": -8.71165988372384e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216554522254885, + "compression/movement_sparsity/model_sparsity": 0.8899937520977326, + "compression_loss": 105.59085845947266, + "distillation_loss": 3.0033059120178223, + "epoch": 4.68, + "learning_rate": 2.956231802385649e-05, + "loss": 109.463, + "step": 5535, + "task_loss": 2.2943849563598633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9987658837930148, + "compression/movement_sparsity/importance_threshold": -8.643422650283714e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216439573278875, + "compression/movement_sparsity/model_sparsity": 0.8899826520852265, + "compression_loss": 105.59125518798828, + "distillation_loss": 4.7117719650268555, + "epoch": 4.68, + "learning_rate": 2.9557621865314172e-05, + "loss": 110.2154, + "step": 5536, + "task_loss": 3.2189416885375977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9987755757606583, + "compression/movement_sparsity/importance_threshold": -8.5755426790287e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216480473173866, + "compression/movement_sparsity/model_sparsity": 0.8899866015710041, + "compression_loss": 105.5915756225586, + "distillation_loss": 5.699721336364746, + "epoch": 4.68, + "learning_rate": 2.955292570677186e-05, + "loss": 109.3269, + "step": 5537, + "task_loss": 3.0598111152648926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9987852168519485, + "compression/movement_sparsity/importance_threshold": -8.508019032260096e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.921680647991703, + "compression/movement_sparsity/model_sparsity": 0.8900180823118664, + "compression_loss": 105.59193420410156, + "distillation_loss": 3.4482791423797607, + "epoch": 4.68, + "learning_rate": 2.954822954822955e-05, + "loss": 109.3828, + "step": 5538, + "task_loss": 1.66573965549469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9987948072007704, + "compression/movement_sparsity/importance_threshold": -8.440850772284403e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216533535719845, + "compression/movement_sparsity/model_sparsity": 0.8899917255394327, + "compression_loss": 105.59231567382812, + "distillation_loss": 3.452169895172119, + "epoch": 4.68, + "learning_rate": 2.9543533389687238e-05, + "loss": 109.6048, + "step": 5539, + "task_loss": 2.3701376914978027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9988043469410092, + "compression/movement_sparsity/importance_threshold": -8.374036961404652e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.921644005024558, + "compression/movement_sparsity/model_sparsity": 0.8899826981433697, + "compression_loss": 105.5926284790039, + "distillation_loss": 3.420638084411621, + "epoch": 4.68, + "learning_rate": 2.9538837231144924e-05, + "loss": 109.3162, + "step": 5540, + "task_loss": 2.4548180103302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9988138362065498, + "compression/movement_sparsity/importance_threshold": -8.307576661924741e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.921613848804607, + "compression/movement_sparsity/model_sparsity": 0.8899535778823454, + "compression_loss": 105.593017578125, + "distillation_loss": 3.7270278930664062, + "epoch": 4.68, + "learning_rate": 2.953414107260261e-05, + "loss": 109.1725, + "step": 5541, + "task_loss": 2.056856393814087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9988232751312777, + "compression/movement_sparsity/importance_threshold": -8.241468936149438e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9215867690199059, + "compression/movement_sparsity/model_sparsity": 0.8899274283715559, + "compression_loss": 105.59333801269531, + "distillation_loss": 4.259583950042725, + "epoch": 4.68, + "learning_rate": 2.95294449140603e-05, + "loss": 109.4983, + "step": 5542, + "task_loss": 2.125389814376831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9988326638490778, + "compression/movement_sparsity/importance_threshold": -8.175712846381772e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216080298108007, + "compression/movement_sparsity/model_sparsity": 0.8899479587888777, + "compression_loss": 105.59367370605469, + "distillation_loss": 4.642014026641846, + "epoch": 4.69, + "learning_rate": 2.952474875551799e-05, + "loss": 109.8015, + "step": 5543, + "task_loss": 2.4712116718292236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9988420024938354, + "compression/movement_sparsity/importance_threshold": -8.110307454924776e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216659693413434, + "compression/movement_sparsity/model_sparsity": 0.8900039079183033, + "compression_loss": 105.59407043457031, + "distillation_loss": 3.6531600952148438, + "epoch": 4.69, + "learning_rate": 2.952005259697568e-05, + "loss": 109.3419, + "step": 5544, + "task_loss": 1.648470401763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9988512911994354, + "compression/movement_sparsity/importance_threshold": -8.04525182408495e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216930968227149, + "compression/movement_sparsity/model_sparsity": 0.8900301034872359, + "compression_loss": 105.59442138671875, + "distillation_loss": 2.666248321533203, + "epoch": 4.69, + "learning_rate": 2.9515356438433362e-05, + "loss": 109.2488, + "step": 5545, + "task_loss": 1.1622225046157837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9988605300997632, + "compression/movement_sparsity/importance_threshold": -7.980545016164457e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9217111977091862, + "compression/movement_sparsity/model_sparsity": 0.890047582552572, + "compression_loss": 105.59477233886719, + "distillation_loss": 4.48745059967041, + "epoch": 4.69, + "learning_rate": 2.951066027989105e-05, + "loss": 109.6328, + "step": 5546, + "task_loss": 3.091247081756592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9988697193287037, + "compression/movement_sparsity/importance_threshold": -7.916186093468065e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9217097191123993, + "compression/movement_sparsity/model_sparsity": 0.8900461547501335, + "compression_loss": 105.59513854980469, + "distillation_loss": 4.150511264801025, + "epoch": 4.69, + "learning_rate": 2.950596412134874e-05, + "loss": 109.844, + "step": 5547, + "task_loss": 3.1403212547302246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9988788590201423, + "compression/movement_sparsity/importance_threshold": -7.852174118299672e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9216973418263933, + "compression/movement_sparsity/model_sparsity": 0.8900342026619787, + "compression_loss": 105.59547424316406, + "distillation_loss": 3.0130043029785156, + "epoch": 4.69, + "learning_rate": 2.9501267962806428e-05, + "loss": 109.4778, + "step": 5548, + "task_loss": 1.7208449840545654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9988879493079639, + "compression/movement_sparsity/importance_threshold": -7.788508152964044e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.921714345689442, + "compression/movement_sparsity/model_sparsity": 0.8900506223900218, + "compression_loss": 105.59584045410156, + "distillation_loss": 3.5317206382751465, + "epoch": 4.69, + "learning_rate": 2.949657180426411e-05, + "loss": 109.9433, + "step": 5549, + "task_loss": 1.816392183303833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9988969903260538, + "compression/movement_sparsity/importance_threshold": -7.725187259762477e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9217758863186107, + "compression/movement_sparsity/model_sparsity": 0.8901100489092574, + "compression_loss": 105.59619903564453, + "distillation_loss": 5.341182231903076, + "epoch": 4.69, + "learning_rate": 2.94918756457218e-05, + "loss": 109.7352, + "step": 5550, + "task_loss": 2.734313488006592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9989059822082972, + "compression/movement_sparsity/importance_threshold": -7.662210501001472e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9217829692741865, + "compression/movement_sparsity/model_sparsity": 0.8901168885435193, + "compression_loss": 105.59648895263672, + "distillation_loss": 4.024999618530273, + "epoch": 4.69, + "learning_rate": 2.948717948717949e-05, + "loss": 108.9484, + "step": 5551, + "task_loss": 2.456216812133789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9989149250885789, + "compression/movement_sparsity/importance_threshold": -7.599576938984061e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9217955611952099, + "compression/movement_sparsity/model_sparsity": 0.8901290478933184, + "compression_loss": 105.5968246459961, + "distillation_loss": 3.687828540802002, + "epoch": 4.69, + "learning_rate": 2.9482483328637177e-05, + "loss": 109.1914, + "step": 5552, + "task_loss": 2.942133903503418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9989238191007845, + "compression/movement_sparsity/importance_threshold": -7.537285636014142e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9217559133378206, + "compression/movement_sparsity/model_sparsity": 0.8900907620618015, + "compression_loss": 105.59710693359375, + "distillation_loss": 4.101009368896484, + "epoch": 4.69, + "learning_rate": 2.947778717009486e-05, + "loss": 109.3429, + "step": 5553, + "task_loss": 2.2548351287841797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9989326643787988, + "compression/movement_sparsity/importance_threshold": -7.475335654396481e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9217146795661358, + "compression/movement_sparsity/model_sparsity": 0.890050944797024, + "compression_loss": 105.5974349975586, + "distillation_loss": 4.991859436035156, + "epoch": 4.69, + "learning_rate": 2.947309101155255e-05, + "loss": 110.481, + "step": 5554, + "task_loss": 2.8656604290008545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9989414610565072, + "compression/movement_sparsity/importance_threshold": -7.41372605643411e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.921726722975448, + "compression/movement_sparsity/model_sparsity": 0.8900625744781765, + "compression_loss": 105.5978012084961, + "distillation_loss": 3.668736219406128, + "epoch": 4.7, + "learning_rate": 2.946839485301024e-05, + "loss": 110.1701, + "step": 5555, + "task_loss": 1.8532589673995972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9989502092677945, + "compression/movement_sparsity/importance_threshold": -7.352455904430927e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9217874050645469, + "compression/movement_sparsity/model_sparsity": 0.890121171950835, + "compression_loss": 105.59812927246094, + "distillation_loss": 3.6224679946899414, + "epoch": 4.7, + "learning_rate": 2.946369869446793e-05, + "loss": 109.897, + "step": 5556, + "task_loss": 1.8085417747497559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9989589091465461, + "compression/movement_sparsity/importance_threshold": -7.291524260693433e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9218594389612352, + "compression/movement_sparsity/model_sparsity": 0.8901907312615698, + "compression_loss": 105.59845733642578, + "distillation_loss": 4.960972785949707, + "epoch": 4.7, + "learning_rate": 2.9459002535925612e-05, + "loss": 109.9854, + "step": 5557, + "task_loss": 3.8202719688415527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9989675608266471, + "compression/movement_sparsity/importance_threshold": -7.230930187522057e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.921879125762002, + "compression/movement_sparsity/model_sparsity": 0.8902097417601665, + "compression_loss": 105.59879302978516, + "distillation_loss": 3.8254401683807373, + "epoch": 4.7, + "learning_rate": 2.9454306377383302e-05, + "loss": 109.4549, + "step": 5558, + "task_loss": 1.8766640424728394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9989761644419826, + "compression/movement_sparsity/importance_threshold": -7.170672747222433e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9219141351181809, + "compression/movement_sparsity/model_sparsity": 0.8902435484372593, + "compression_loss": 105.59912109375, + "distillation_loss": 5.655785083770752, + "epoch": 4.7, + "learning_rate": 2.9449610218840988e-05, + "loss": 109.5628, + "step": 5559, + "task_loss": 3.0628976821899414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9989847201264378, + "compression/movement_sparsity/importance_threshold": -7.110751002097591e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9218850401491494, + "compression/movement_sparsity/model_sparsity": 0.8902154529699207, + "compression_loss": 105.59945678710938, + "distillation_loss": 4.490151405334473, + "epoch": 4.7, + "learning_rate": 2.9444914060298678e-05, + "loss": 109.6885, + "step": 5560, + "task_loss": 3.1283621788024902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9989932280138978, + "compression/movement_sparsity/importance_threshold": -7.0511640144531665e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9218889035794634, + "compression/movement_sparsity/model_sparsity": 0.890219183679518, + "compression_loss": 105.59978485107422, + "distillation_loss": 3.3454227447509766, + "epoch": 4.7, + "learning_rate": 2.9440217901756368e-05, + "loss": 109.3777, + "step": 5561, + "task_loss": 1.4451587200164795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990016882382476, + "compression/movement_sparsity/importance_threshold": -6.991910846592189e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9219447683048374, + "compression/movement_sparsity/model_sparsity": 0.8902731292797155, + "compression_loss": 105.6000747680664, + "distillation_loss": 3.354673147201538, + "epoch": 4.7, + "learning_rate": 2.943552174321405e-05, + "loss": 109.6329, + "step": 5562, + "task_loss": 2.4315085411071777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990101009333726, + "compression/movement_sparsity/importance_threshold": -6.932990560818558e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9219354078332432, + "compression/movement_sparsity/model_sparsity": 0.8902640903691168, + "compression_loss": 105.60040283203125, + "distillation_loss": 4.514801502227783, + "epoch": 4.7, + "learning_rate": 2.943082558467174e-05, + "loss": 109.8817, + "step": 5563, + "task_loss": 2.743793249130249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990184662331578, + "compression/movement_sparsity/importance_threshold": -6.87440221943704e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.92196210604458, + "compression/movement_sparsity/model_sparsity": 0.8902898714147607, + "compression_loss": 105.60069274902344, + "distillation_loss": 5.476081848144531, + "epoch": 4.7, + "learning_rate": 2.942612942612943e-05, + "loss": 110.2584, + "step": 5564, + "task_loss": 2.48602032661438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990267842714884, + "compression/movement_sparsity/importance_threshold": -6.816144884750665e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9219440170822764, + "compression/movement_sparsity/model_sparsity": 0.8902724038639604, + "compression_loss": 105.60093688964844, + "distillation_loss": 4.484657287597656, + "epoch": 4.7, + "learning_rate": 2.9421433267587116e-05, + "loss": 110.5321, + "step": 5565, + "task_loss": 2.4490067958831787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990350551822496, + "compression/movement_sparsity/importance_threshold": -6.758217619062465e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9219091388919415, + "compression/movement_sparsity/model_sparsity": 0.8902387238467614, + "compression_loss": 105.601318359375, + "distillation_loss": 5.056672096252441, + "epoch": 4.7, + "learning_rate": 2.94167371090448e-05, + "loss": 110.1083, + "step": 5566, + "task_loss": 2.8924527168273926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990432790993262, + "compression/movement_sparsity/importance_threshold": -6.700619484679808e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9219169253734077, + "compression/movement_sparsity/model_sparsity": 0.8902462428386352, + "compression_loss": 105.6015853881836, + "distillation_loss": 4.894302845001221, + "epoch": 4.71, + "learning_rate": 2.941204095050249e-05, + "loss": 109.9424, + "step": 5567, + "task_loss": 1.9469314813613892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990514561566037, + "compression/movement_sparsity/importance_threshold": -6.643349543903124e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.922036012035587, + "compression/movement_sparsity/model_sparsity": 0.8903612385076153, + "compression_loss": 105.60186767578125, + "distillation_loss": 3.305497646331787, + "epoch": 4.71, + "learning_rate": 2.940734479196018e-05, + "loss": 109.741, + "step": 5568, + "task_loss": 2.193976402282715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990595864879671, + "compression/movement_sparsity/importance_threshold": -6.5864068590389124e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9219974850499556, + "compression/movement_sparsity/model_sparsity": 0.8903240350424632, + "compression_loss": 105.60218811035156, + "distillation_loss": 4.102393627166748, + "epoch": 4.71, + "learning_rate": 2.940264863341787e-05, + "loss": 109.6391, + "step": 5569, + "task_loss": 2.1778347492218018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990676702273016, + "compression/movement_sparsity/importance_threshold": -6.529790492389338e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9219882199717025, + "compression/movement_sparsity/model_sparsity": 0.8903150882481508, + "compression_loss": 105.60255432128906, + "distillation_loss": 5.236992835998535, + "epoch": 4.71, + "learning_rate": 2.939795247487555e-05, + "loss": 109.8669, + "step": 5570, + "task_loss": 3.277637481689453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990757075084923, + "compression/movement_sparsity/importance_threshold": -6.473499506259167e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9220859385254784, + "compression/movement_sparsity/model_sparsity": 0.8904094498689871, + "compression_loss": 105.602783203125, + "distillation_loss": 5.6884765625, + "epoch": 4.71, + "learning_rate": 2.939325631633324e-05, + "loss": 109.9738, + "step": 5571, + "task_loss": 3.4811792373657227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990836984654243, + "compression/movement_sparsity/importance_threshold": -6.417532962952298e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.92210802208394, + "compression/movement_sparsity/model_sparsity": 0.8904307747892786, + "compression_loss": 105.60305786132812, + "distillation_loss": 6.012972831726074, + "epoch": 4.71, + "learning_rate": 2.9388560157790928e-05, + "loss": 109.6622, + "step": 5572, + "task_loss": 3.3200559616088867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990916432319829, + "compression/movement_sparsity/importance_threshold": -6.361889924772629e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9221484927088962, + "compression/movement_sparsity/model_sparsity": 0.8904698551237652, + "compression_loss": 105.60332489013672, + "distillation_loss": 5.538825988769531, + "epoch": 4.71, + "learning_rate": 2.9383863999248617e-05, + "loss": 110.2575, + "step": 5573, + "task_loss": 3.4202160835266113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9990995419420531, + "compression/movement_sparsity/importance_threshold": -6.306569454024059e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.922173974655134, + "compression/movement_sparsity/model_sparsity": 0.8904944616867582, + "compression_loss": 105.6036376953125, + "distillation_loss": 3.3037567138671875, + "epoch": 4.71, + "learning_rate": 2.93791678407063e-05, + "loss": 109.9624, + "step": 5574, + "task_loss": 1.8733536005020142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991073947295199, + "compression/movement_sparsity/importance_threshold": -6.251570613011355e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9221768483795343, + "compression/movement_sparsity/model_sparsity": 0.8904972366898846, + "compression_loss": 105.60387420654297, + "distillation_loss": 3.0969977378845215, + "epoch": 4.71, + "learning_rate": 2.937447168216399e-05, + "loss": 108.7325, + "step": 5575, + "task_loss": 1.4023114442825317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991152017282688, + "compression/movement_sparsity/importance_threshold": -6.196892464037547e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222409765530799, + "compression/movement_sparsity/model_sparsity": 0.8905591618633877, + "compression_loss": 105.60407257080078, + "distillation_loss": 3.2210378646850586, + "epoch": 4.71, + "learning_rate": 2.936977552362168e-05, + "loss": 109.6591, + "step": 5576, + "task_loss": 1.3083778619766235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991229630721846, + "compression/movement_sparsity/importance_threshold": -6.1425340694074015e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222008040323146, + "compression/movement_sparsity/model_sparsity": 0.8905203693922958, + "compression_loss": 105.60431671142578, + "distillation_loss": 4.883753299713135, + "epoch": 4.71, + "learning_rate": 2.9365079365079366e-05, + "loss": 109.7846, + "step": 5577, + "task_loss": 2.3021059036254883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991306788951527, + "compression/movement_sparsity/importance_threshold": -6.08849449142395e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222622492681423, + "compression/movement_sparsity/model_sparsity": 0.8905797037952451, + "compression_loss": 105.60458374023438, + "distillation_loss": 3.5930442810058594, + "epoch": 4.71, + "learning_rate": 2.9360383206537056e-05, + "loss": 109.3932, + "step": 5578, + "task_loss": 1.856186866760254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991383493310582, + "compression/movement_sparsity/importance_threshold": -6.034772792391958e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223005735429239, + "compression/movement_sparsity/model_sparsity": 0.8906167115132888, + "compression_loss": 105.6048583984375, + "distillation_loss": 4.280333518981934, + "epoch": 4.72, + "learning_rate": 2.935568704799474e-05, + "loss": 109.6598, + "step": 5579, + "task_loss": 2.888791799545288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991459745137861, + "compression/movement_sparsity/importance_threshold": -5.981368034614458e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222862764659285, + "compression/movement_sparsity/model_sparsity": 0.8906029055848711, + "compression_loss": 105.60505676269531, + "distillation_loss": 4.122594833374023, + "epoch": 4.72, + "learning_rate": 2.935099088945243e-05, + "loss": 109.5739, + "step": 5580, + "task_loss": 2.5315141677856445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991535545772214, + "compression/movement_sparsity/importance_threshold": -5.9282792803979495e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222529484173864, + "compression/movement_sparsity/model_sparsity": 0.8905707224573254, + "compression_loss": 105.60533142089844, + "distillation_loss": 5.100676536560059, + "epoch": 4.72, + "learning_rate": 2.9346294730910118e-05, + "loss": 109.7613, + "step": 5581, + "task_loss": 3.0022695064544678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991610896552497, + "compression/movement_sparsity/importance_threshold": -5.875505592042862e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222779176244158, + "compression/movement_sparsity/model_sparsity": 0.8905948338952792, + "compression_loss": 105.6055679321289, + "distillation_loss": 4.076694965362549, + "epoch": 4.72, + "learning_rate": 2.9341598572367808e-05, + "loss": 109.3416, + "step": 5582, + "task_loss": 3.1584131717681885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991685798817559, + "compression/movement_sparsity/importance_threshold": -5.82304603185483e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223399471444576, + "compression/movement_sparsity/model_sparsity": 0.8906547325104823, + "compression_loss": 105.60575103759766, + "distillation_loss": 5.687671661376953, + "epoch": 4.72, + "learning_rate": 2.933690241382549e-05, + "loss": 110.1453, + "step": 5583, + "task_loss": 4.006703853607178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999176025390625, + "compression/movement_sparsity/importance_threshold": -5.770899662138618e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223615298878785, + "compression/movement_sparsity/model_sparsity": 0.8906755738202705, + "compression_loss": 105.6059341430664, + "distillation_loss": 4.237778186798096, + "epoch": 4.72, + "learning_rate": 2.933220625528318e-05, + "loss": 109.5858, + "step": 5584, + "task_loss": 4.231777191162109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991834263157423, + "compression/movement_sparsity/importance_threshold": -5.719065545197258e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223449195223618, + "compression/movement_sparsity/model_sparsity": 0.8906595340719087, + "compression_loss": 105.60612487792969, + "distillation_loss": 5.169844627380371, + "epoch": 4.72, + "learning_rate": 2.9327510096740867e-05, + "loss": 109.8318, + "step": 5585, + "task_loss": 4.575788497924805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991907827909929, + "compression/movement_sparsity/importance_threshold": -5.667542743334648e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223528729421748, + "compression/movement_sparsity/model_sparsity": 0.8906672142672837, + "compression_loss": 105.60625457763672, + "distillation_loss": 4.209450721740723, + "epoch": 4.72, + "learning_rate": 2.9322813938198557e-05, + "loss": 110.5658, + "step": 5586, + "task_loss": 2.2490382194519043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9991980949502621, + "compression/movement_sparsity/importance_threshold": -5.616330318854688e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223205703720493, + "compression/movement_sparsity/model_sparsity": 0.8906360213898162, + "compression_loss": 105.60641479492188, + "distillation_loss": 4.804665565490723, + "epoch": 4.72, + "learning_rate": 2.931811777965624e-05, + "loss": 109.2357, + "step": 5587, + "task_loss": 2.4421379566192627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9992053629274348, + "compression/movement_sparsity/importance_threshold": -5.565427334063011e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222619988606219, + "compression/movement_sparsity/model_sparsity": 0.8905794619899934, + "compression_loss": 105.60655975341797, + "distillation_loss": 3.900477886199951, + "epoch": 4.72, + "learning_rate": 2.931342162111393e-05, + "loss": 109.8548, + "step": 5588, + "task_loss": 2.1990997791290283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9992125868563962, + "compression/movement_sparsity/importance_threshold": -5.514832851260913e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222659696084446, + "compression/movement_sparsity/model_sparsity": 0.890583296330413, + "compression_loss": 105.60664367675781, + "distillation_loss": 4.9571123123168945, + "epoch": 4.72, + "learning_rate": 2.930872546257162e-05, + "loss": 109.2734, + "step": 5589, + "task_loss": 2.4132933616638184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9992197668710315, + "compression/movement_sparsity/importance_threshold": -5.464545932754895e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222598167379445, + "compression/movement_sparsity/model_sparsity": 0.890577354829943, + "compression_loss": 105.60681915283203, + "distillation_loss": 3.153958320617676, + "epoch": 4.72, + "learning_rate": 2.9304029304029305e-05, + "loss": 109.3255, + "step": 5590, + "task_loss": 1.3000088930130005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9992269031052259, + "compression/movement_sparsity/importance_threshold": -5.4145656408471216e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222596497995976, + "compression/movement_sparsity/model_sparsity": 0.8905771936264419, + "compression_loss": 105.60693359375, + "distillation_loss": 3.181910991668701, + "epoch": 4.73, + "learning_rate": 2.9299333145486995e-05, + "loss": 109.0583, + "step": 5591, + "task_loss": 1.8961840867996216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9992339956928643, + "compression/movement_sparsity/importance_threshold": -5.364891037843225e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222647056466753, + "compression/movement_sparsity/model_sparsity": 0.8905820757896188, + "compression_loss": 105.60704803466797, + "distillation_loss": 4.400503158569336, + "epoch": 4.73, + "learning_rate": 2.9294636986944678e-05, + "loss": 109.7531, + "step": 5592, + "task_loss": 2.0076029300689697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9992410447678322, + "compression/movement_sparsity/importance_threshold": -5.315521186044503e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222514340480965, + "compression/movement_sparsity/model_sparsity": 0.8905692601112795, + "compression_loss": 105.60713195800781, + "distillation_loss": 5.280132293701172, + "epoch": 4.73, + "learning_rate": 2.9289940828402368e-05, + "loss": 109.7759, + "step": 5593, + "task_loss": 2.8329017162323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9992480504640143, + "compression/movement_sparsity/importance_threshold": -5.266455147758323e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222018891315695, + "compression/movement_sparsity/model_sparsity": 0.8905214172150532, + "compression_loss": 105.60722351074219, + "distillation_loss": 3.7911579608917236, + "epoch": 4.73, + "learning_rate": 2.9285244669860057e-05, + "loss": 109.4872, + "step": 5594, + "task_loss": 2.000415802001953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9992550129152962, + "compression/movement_sparsity/importance_threshold": -5.217691985285981e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9221883671254705, + "compression/movement_sparsity/model_sparsity": 0.8905083597314621, + "compression_loss": 105.60737609863281, + "distillation_loss": 4.422634124755859, + "epoch": 4.73, + "learning_rate": 2.9280548511317747e-05, + "loss": 109.5805, + "step": 5595, + "task_loss": 2.5590217113494873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9992619322555627, + "compression/movement_sparsity/importance_threshold": -5.169230760932245e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222048105526404, + "compression/movement_sparsity/model_sparsity": 0.8905242382763228, + "compression_loss": 105.60743713378906, + "distillation_loss": 3.7626872062683105, + "epoch": 4.73, + "learning_rate": 2.927585235277543e-05, + "loss": 109.7688, + "step": 5596, + "task_loss": 2.32246732711792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999268808618699, + "compression/movement_sparsity/importance_threshold": -5.121070537002746e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9221765264270081, + "compression/movement_sparsity/model_sparsity": 0.8904969257974181, + "compression_loss": 105.6075668334961, + "distillation_loss": 3.9103686809539795, + "epoch": 4.73, + "learning_rate": 2.9271156194233117e-05, + "loss": 110.2285, + "step": 5597, + "task_loss": 2.6917808055877686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9992756421385904, + "compression/movement_sparsity/importance_threshold": -5.073210375798783e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222107010914524, + "compression/movement_sparsity/model_sparsity": 0.8905299264570053, + "compression_loss": 105.60763549804688, + "distillation_loss": 3.9632201194763184, + "epoch": 4.73, + "learning_rate": 2.9266460035690806e-05, + "loss": 109.969, + "step": 5598, + "task_loss": 3.065769910812378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999282432949122, + "compression/movement_sparsity/importance_threshold": -5.02564933962512e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9221763594886612, + "compression/movement_sparsity/model_sparsity": 0.890496764593917, + "compression_loss": 105.60772705078125, + "distillation_loss": 4.642152786254883, + "epoch": 4.73, + "learning_rate": 2.9261763877148496e-05, + "loss": 110.5089, + "step": 5599, + "task_loss": 2.245070219039917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9992891811841788, + "compression/movement_sparsity/importance_threshold": -4.978386490787391e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9221357219253582, + "compression/movement_sparsity/model_sparsity": 0.8904575230559293, + "compression_loss": 105.60783386230469, + "distillation_loss": 4.128574848175049, + "epoch": 4.73, + "learning_rate": 2.925706771860618e-05, + "loss": 110.2386, + "step": 5600, + "task_loss": 2.39890193939209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999295886977646, + "compression/movement_sparsity/importance_threshold": -4.931420891587761e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222088886179718, + "compression/movement_sparsity/model_sparsity": 0.8905281762475645, + "compression_loss": 105.60791015625, + "distillation_loss": 3.3879923820495605, + "epoch": 4.73, + "learning_rate": 2.925237156006387e-05, + "loss": 109.5162, + "step": 5601, + "task_loss": 3.122440814971924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993025504634088, + "compression/movement_sparsity/importance_threshold": -4.884751604331862e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222581115819727, + "compression/movement_sparsity/model_sparsity": 0.8905757082513244, + "compression_loss": 105.60799407958984, + "distillation_loss": 5.597792625427246, + "epoch": 4.73, + "learning_rate": 2.924767540152156e-05, + "loss": 110.2009, + "step": 5602, + "task_loss": 2.21575665473938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993091717753523, + "compression/movement_sparsity/importance_threshold": -4.838377691321859e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222598286621122, + "compression/movement_sparsity/model_sparsity": 0.8905773663444788, + "compression_loss": 105.6080551147461, + "distillation_loss": 4.800532341003418, + "epoch": 4.74, + "learning_rate": 2.9242979242979245e-05, + "loss": 110.6373, + "step": 5603, + "task_loss": 1.7648063898086548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993157510473617, + "compression/movement_sparsity/importance_threshold": -4.79229821486165e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.922251255185582, + "compression/movement_sparsity/model_sparsity": 0.8905690873932426, + "compression_loss": 105.6081314086914, + "distillation_loss": 4.228703022003174, + "epoch": 4.74, + "learning_rate": 2.9238283084436928e-05, + "loss": 110.0539, + "step": 5604, + "task_loss": 1.912705898284912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999322288413322, + "compression/movement_sparsity/importance_threshold": -4.746512237256868e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222495023329396, + "compression/movement_sparsity/model_sparsity": 0.8905673947564807, + "compression_loss": 105.60822296142578, + "distillation_loss": 3.682400703430176, + "epoch": 4.74, + "learning_rate": 2.9233586925894617e-05, + "loss": 109.3762, + "step": 5605, + "task_loss": 2.4366307258605957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993287840071184, + "compression/movement_sparsity/importance_threshold": -4.701018820810546e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222358491609964, + "compression/movement_sparsity/model_sparsity": 0.890554210612996, + "compression_loss": 105.60833740234375, + "distillation_loss": 5.052546977996826, + "epoch": 4.74, + "learning_rate": 2.9228890767352307e-05, + "loss": 110.0106, + "step": 5606, + "task_loss": 3.261329174041748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993352379626362, + "compression/movement_sparsity/importance_threshold": -4.655817027826581e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222561560184803, + "compression/movement_sparsity/model_sparsity": 0.8905738198674541, + "compression_loss": 105.60840606689453, + "distillation_loss": 4.146862983703613, + "epoch": 4.74, + "learning_rate": 2.9224194608809997e-05, + "loss": 109.8905, + "step": 5607, + "task_loss": 2.6811201572418213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993416504137603, + "compression/movement_sparsity/importance_threshold": -4.61090592060974e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.922283772390725, + "compression/movement_sparsity/model_sparsity": 0.8906004875323542, + "compression_loss": 105.60856628417969, + "distillation_loss": 4.35052490234375, + "epoch": 4.74, + "learning_rate": 2.9219498450267687e-05, + "loss": 109.8753, + "step": 5608, + "task_loss": 2.6895864009857178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993480214943761, + "compression/movement_sparsity/importance_threshold": -4.5662845614630534e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222717051330775, + "compression/movement_sparsity/model_sparsity": 0.8905888348221301, + "compression_loss": 105.60872650146484, + "distillation_loss": 3.369302272796631, + "epoch": 4.74, + "learning_rate": 2.921480229172537e-05, + "loss": 109.3214, + "step": 5609, + "task_loss": 1.4828367233276367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993543513383685, + "compression/movement_sparsity/importance_threshold": -4.521952012691288e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222965550984307, + "compression/movement_sparsity/model_sparsity": 0.890612831114726, + "compression_loss": 105.60883331298828, + "distillation_loss": 3.7170021533966064, + "epoch": 4.74, + "learning_rate": 2.9210106133183056e-05, + "loss": 109.8172, + "step": 5610, + "task_loss": 2.8559083938598633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993606400796227, + "compression/movement_sparsity/importance_threshold": -4.477907336597475e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.922320021860338, + "compression/movement_sparsity/model_sparsity": 0.8906354917211696, + "compression_loss": 105.60894775390625, + "distillation_loss": 4.377997398376465, + "epoch": 4.74, + "learning_rate": 2.9205409974640746e-05, + "loss": 109.2672, + "step": 5611, + "task_loss": 2.5065746307373047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993668878520238, + "compression/movement_sparsity/importance_threshold": -4.434149595487248e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223383850784973, + "compression/movement_sparsity/model_sparsity": 0.8906532241062932, + "compression_loss": 105.6090316772461, + "distillation_loss": 4.252251625061035, + "epoch": 4.74, + "learning_rate": 2.9200713816098435e-05, + "loss": 109.5054, + "step": 5612, + "task_loss": 2.5108642578125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993730947894571, + "compression/movement_sparsity/importance_threshold": -4.390677851661903e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223015632488377, + "compression/movement_sparsity/model_sparsity": 0.8906176672197598, + "compression_loss": 105.6091537475586, + "distillation_loss": 4.167950630187988, + "epoch": 4.74, + "learning_rate": 2.9196017657556118e-05, + "loss": 109.0515, + "step": 5613, + "task_loss": 2.6121675968170166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993792610258077, + "compression/movement_sparsity/importance_threshold": -4.3474911674279415e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222815544955447, + "compression/movement_sparsity/model_sparsity": 0.8905983458286965, + "compression_loss": 105.6092300415039, + "distillation_loss": 4.750764846801758, + "epoch": 4.75, + "learning_rate": 2.9191321499013808e-05, + "loss": 110.1734, + "step": 5614, + "task_loss": 2.9575939178466797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993853866949606, + "compression/movement_sparsity/importance_threshold": -4.304588605088394e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223010505096294, + "compression/movement_sparsity/model_sparsity": 0.8906171720947206, + "compression_loss": 105.6092300415039, + "distillation_loss": 5.412894248962402, + "epoch": 4.75, + "learning_rate": 2.9186625340471498e-05, + "loss": 110.1121, + "step": 5615, + "task_loss": 3.969921827316284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999391471930801, + "compression/movement_sparsity/importance_threshold": -4.26196922694716e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223165280792207, + "compression/movement_sparsity/model_sparsity": 0.8906321179621819, + "compression_loss": 105.60932159423828, + "distillation_loss": 4.211411952972412, + "epoch": 4.75, + "learning_rate": 2.9181929181929184e-05, + "loss": 109.91, + "step": 5616, + "task_loss": 3.04245662689209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9993975168672142, + "compression/movement_sparsity/importance_threshold": -4.219632095308137e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222415250647912, + "compression/movement_sparsity/model_sparsity": 0.8905596915320342, + "compression_loss": 105.609375, + "distillation_loss": 3.9418325424194336, + "epoch": 4.75, + "learning_rate": 2.9177233023386867e-05, + "loss": 109.8852, + "step": 5617, + "task_loss": 1.845381736755371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994035216380851, + "compression/movement_sparsity/importance_threshold": -4.177576272476093e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222629050973623, + "compression/movement_sparsity/model_sparsity": 0.8905803370947138, + "compression_loss": 105.60942077636719, + "distillation_loss": 4.406393051147461, + "epoch": 4.75, + "learning_rate": 2.9172536864844557e-05, + "loss": 109.5133, + "step": 5618, + "task_loss": 2.2049105167388916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994094863772991, + "compression/movement_sparsity/importance_threshold": -4.13580082075319e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9222959469658812, + "compression/movement_sparsity/model_sparsity": 0.8906122438734004, + "compression_loss": 105.60942840576172, + "distillation_loss": 4.3336358070373535, + "epoch": 4.75, + "learning_rate": 2.9167840706302246e-05, + "loss": 110.2571, + "step": 5619, + "task_loss": 3.0254037380218506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999415411218741, + "compression/movement_sparsity/importance_threshold": -4.094304802445929e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223347243590331, + "compression/movement_sparsity/model_sparsity": 0.8906496891438044, + "compression_loss": 105.60943603515625, + "distillation_loss": 3.093466281890869, + "epoch": 4.75, + "learning_rate": 2.9163144547759936e-05, + "loss": 109.2726, + "step": 5620, + "task_loss": 1.5833241939544678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994212962962963, + "compression/movement_sparsity/importance_threshold": -4.053087279856475e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223158484016655, + "compression/movement_sparsity/model_sparsity": 0.8906314616336416, + "compression_loss": 105.60943603515625, + "distillation_loss": 4.237001895904541, + "epoch": 4.75, + "learning_rate": 2.9158448389217623e-05, + "loss": 109.4637, + "step": 5621, + "task_loss": 1.3900835514068604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.99942714174385, + "compression/movement_sparsity/importance_threshold": -4.012147315288726e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223312186537481, + "compression/movement_sparsity/model_sparsity": 0.8906463038702808, + "compression_loss": 105.60940551757812, + "distillation_loss": 3.6893773078918457, + "epoch": 4.75, + "learning_rate": 2.915375223067531e-05, + "loss": 109.5022, + "step": 5622, + "task_loss": 1.9447572231292725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999432947695287, + "compression/movement_sparsity/importance_threshold": -3.971483971047449e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223598247319066, + "compression/movement_sparsity/model_sparsity": 0.8906739272416518, + "compression_loss": 105.60939025878906, + "distillation_loss": 3.966864585876465, + "epoch": 4.75, + "learning_rate": 2.9149056072132995e-05, + "loss": 109.0791, + "step": 5623, + "task_loss": 1.7631661891937256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994387142844928, + "compression/movement_sparsity/importance_threshold": -3.931096309436541e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9224275778524136, + "compression/movement_sparsity/model_sparsity": 0.8907393528340364, + "compression_loss": 105.60933685302734, + "distillation_loss": 2.955288887023926, + "epoch": 4.75, + "learning_rate": 2.9144359913590685e-05, + "loss": 109.4623, + "step": 5624, + "task_loss": 1.7654914855957031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994444416453524, + "compression/movement_sparsity/importance_threshold": -3.890983392759902e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9224232016828912, + "compression/movement_sparsity/model_sparsity": 0.8907351269993998, + "compression_loss": 105.60929870605469, + "distillation_loss": 5.397034168243408, + "epoch": 4.75, + "learning_rate": 2.9139663755048375e-05, + "loss": 109.9182, + "step": 5625, + "task_loss": 3.168004035949707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999450129911751, + "compression/movement_sparsity/importance_threshold": -3.85114428332143e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223894085918112, + "compression/movement_sparsity/model_sparsity": 0.8907024948049581, + "compression_loss": 105.60926055908203, + "distillation_loss": 4.069596290588379, + "epoch": 4.76, + "learning_rate": 2.9134967596506058e-05, + "loss": 109.1805, + "step": 5626, + "task_loss": 1.802577018737793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994557792175736, + "compression/movement_sparsity/importance_threshold": -3.8115780434250235e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223828741479467, + "compression/movement_sparsity/model_sparsity": 0.8906961848393427, + "compression_loss": 105.60919952392578, + "distillation_loss": 3.475609540939331, + "epoch": 4.76, + "learning_rate": 2.9130271437963747e-05, + "loss": 108.8786, + "step": 5627, + "task_loss": 2.124619722366333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994613896967054, + "compression/movement_sparsity/importance_threshold": -3.772283735374582e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223769716849669, + "compression/movement_sparsity/model_sparsity": 0.8906904851441244, + "compression_loss": 105.60908508300781, + "distillation_loss": 4.698302745819092, + "epoch": 4.76, + "learning_rate": 2.9125575279421434e-05, + "loss": 109.8252, + "step": 5628, + "task_loss": 2.3554258346557617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994669614830316, + "compression/movement_sparsity/importance_threshold": -3.733260421474871e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223554724107195, + "compression/movement_sparsity/model_sparsity": 0.8906697244360868, + "compression_loss": 105.60902404785156, + "distillation_loss": 3.1704049110412598, + "epoch": 4.76, + "learning_rate": 2.9120879120879123e-05, + "loss": 109.6902, + "step": 5629, + "task_loss": 2.841761350631714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994724947104373, + "compression/movement_sparsity/importance_threshold": -3.694507164028922e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.922418360470831, + "compression/movement_sparsity/model_sparsity": 0.8907304520978673, + "compression_loss": 105.60891723632812, + "distillation_loss": 5.021284580230713, + "epoch": 4.76, + "learning_rate": 2.9116182962336806e-05, + "loss": 109.8644, + "step": 5630, + "task_loss": 3.831285238265991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994779895128076, + "compression/movement_sparsity/importance_threshold": -3.6560230253406337e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9224172038265703, + "compression/movement_sparsity/model_sparsity": 0.8907293351878952, + "compression_loss": 105.60879516601562, + "distillation_loss": 3.7207274436950684, + "epoch": 4.76, + "learning_rate": 2.9111486803794496e-05, + "loss": 109.5091, + "step": 5631, + "task_loss": 1.6434932947158813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994834460240277, + "compression/movement_sparsity/importance_threshold": -3.6178070677156393e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9224530836469866, + "compression/movement_sparsity/model_sparsity": 0.890763982426101, + "compression_loss": 105.60871887207031, + "distillation_loss": 4.3512468338012695, + "epoch": 4.76, + "learning_rate": 2.9106790645252186e-05, + "loss": 109.9053, + "step": 5632, + "task_loss": 2.5089259147644043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994888643779828, + "compression/movement_sparsity/importance_threshold": -3.5798583534552356e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9223592762201953, + "compression/movement_sparsity/model_sparsity": 0.8906733975730053, + "compression_loss": 105.60857391357422, + "distillation_loss": 3.995251178741455, + "epoch": 4.76, + "learning_rate": 2.9102094486709876e-05, + "loss": 109.3422, + "step": 5633, + "task_loss": 1.972611665725708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999494244708558, + "compression/movement_sparsity/importance_threshold": -3.542175944865056e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9224251810947187, + "compression/movement_sparsity/model_sparsity": 0.8907370384123418, + "compression_loss": 105.60847473144531, + "distillation_loss": 3.2171730995178223, + "epoch": 4.76, + "learning_rate": 2.909739832816756e-05, + "loss": 109.2319, + "step": 5634, + "task_loss": 2.8067142963409424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9994995871496383, + "compression/movement_sparsity/importance_threshold": -3.5047589042489988e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9224059474123221, + "compression/movement_sparsity/model_sparsity": 0.8907184654661051, + "compression_loss": 105.60838317871094, + "distillation_loss": 3.8062636852264404, + "epoch": 4.76, + "learning_rate": 2.9092702169625248e-05, + "loss": 109.7905, + "step": 5635, + "task_loss": 2.176510810852051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995048918351089, + "compression/movement_sparsity/importance_threshold": -3.467606293910963e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9225063369796485, + "compression/movement_sparsity/model_sparsity": 0.8908154063429594, + "compression_loss": 105.60826110839844, + "distillation_loss": 4.2388176918029785, + "epoch": 4.76, + "learning_rate": 2.9088006011082935e-05, + "loss": 109.705, + "step": 5636, + "task_loss": 1.9871796369552612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995101588988551, + "compression/movement_sparsity/importance_threshold": -3.430717176155715e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9225204194216263, + "compression/movement_sparsity/model_sparsity": 0.8908290050097327, + "compression_loss": 105.60816955566406, + "distillation_loss": 4.142559051513672, + "epoch": 4.76, + "learning_rate": 2.9083309852540624e-05, + "loss": 109.5132, + "step": 5637, + "task_loss": 2.2919771671295166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995153884747618, + "compression/movement_sparsity/importance_threshold": -3.394090613285418e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.922522363060951, + "compression/movement_sparsity/model_sparsity": 0.8908308818790672, + "compression_loss": 105.60810852050781, + "distillation_loss": 2.455982208251953, + "epoch": 4.77, + "learning_rate": 2.9078613693998314e-05, + "loss": 108.8454, + "step": 5638, + "task_loss": 1.6038649082183838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995205806967143, + "compression/movement_sparsity/importance_threshold": -3.3577256676057057e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9225714071624374, + "compression/movement_sparsity/model_sparsity": 0.8908782411647902, + "compression_loss": 105.60797119140625, + "distillation_loss": 4.4005022048950195, + "epoch": 4.77, + "learning_rate": 2.9073917535455997e-05, + "loss": 109.7628, + "step": 5639, + "task_loss": 2.1704659461975098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995257356985978, + "compression/movement_sparsity/importance_threshold": -3.3216214014196097e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9225260237804153, + "compression/movement_sparsity/model_sparsity": 0.8908344168415562, + "compression_loss": 105.60785675048828, + "distillation_loss": 3.815706491470337, + "epoch": 4.77, + "learning_rate": 2.9069221376913687e-05, + "loss": 109.8515, + "step": 5640, + "task_loss": 3.159796953201294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995308536142973, + "compression/movement_sparsity/importance_threshold": -3.2857768770310283e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9225654093061164, + "compression/movement_sparsity/model_sparsity": 0.8908724493532855, + "compression_loss": 105.6076889038086, + "distillation_loss": 5.378450870513916, + "epoch": 4.77, + "learning_rate": 2.9064525218371373e-05, + "loss": 110.1261, + "step": 5641, + "task_loss": 2.8019936084747314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999535934577698, + "compression/movement_sparsity/importance_threshold": -3.250191156744728e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9226058680069049, + "compression/movement_sparsity/model_sparsity": 0.8909115181732364, + "compression_loss": 105.60759735107422, + "distillation_loss": 2.6440932750701904, + "epoch": 4.77, + "learning_rate": 2.9059829059829063e-05, + "loss": 108.9915, + "step": 5642, + "task_loss": 0.9439173340797424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999540978722685, + "compression/movement_sparsity/importance_threshold": -3.2148633028646068e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9226422844148648, + "compression/movement_sparsity/model_sparsity": 0.890946683565553, + "compression_loss": 105.60747528076172, + "distillation_loss": 3.772413730621338, + "epoch": 4.77, + "learning_rate": 2.9055132901286746e-05, + "loss": 110.0079, + "step": 5643, + "task_loss": 2.9333770275115967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995459861831435, + "compression/movement_sparsity/importance_threshold": -3.1797923776936965e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.922691793558889, + "compression/movement_sparsity/model_sparsity": 0.8909944919181718, + "compression_loss": 105.60733032226562, + "distillation_loss": 3.9946603775024414, + "epoch": 4.77, + "learning_rate": 2.9050436742744435e-05, + "loss": 109.1117, + "step": 5644, + "task_loss": 3.417046070098877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995509570929586, + "compression/movement_sparsity/importance_threshold": -3.1449774435358957e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.922716810462589, + "compression/movement_sparsity/model_sparsity": 0.8910186494142688, + "compression_loss": 105.60714721679688, + "distillation_loss": 3.8972456455230713, + "epoch": 4.77, + "learning_rate": 2.9045740584202125e-05, + "loss": 109.5048, + "step": 5645, + "task_loss": 2.5593321323394775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995558915860154, + "compression/movement_sparsity/importance_threshold": -3.1104175626968378e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9227141990698767, + "compression/movement_sparsity/model_sparsity": 0.8910161277309298, + "compression_loss": 105.60697174072266, + "distillation_loss": 3.189892053604126, + "epoch": 4.77, + "learning_rate": 2.9041044425659815e-05, + "loss": 109.4921, + "step": 5646, + "task_loss": 1.401733636856079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995607897961991, + "compression/movement_sparsity/importance_threshold": -3.076111797479554e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9227340408848228, + "compression/movement_sparsity/model_sparsity": 0.891035287918492, + "compression_loss": 105.60681915283203, + "distillation_loss": 5.323644638061523, + "epoch": 4.77, + "learning_rate": 2.9036348267117498e-05, + "loss": 109.8583, + "step": 5647, + "task_loss": 2.8699374198913574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995656518573949, + "compression/movement_sparsity/importance_threshold": -3.042059210187943e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9227534772780693, + "compression/movement_sparsity/model_sparsity": 0.891054056611837, + "compression_loss": 105.60670471191406, + "distillation_loss": 3.7010111808776855, + "epoch": 4.77, + "learning_rate": 2.9031652108575184e-05, + "loss": 109.1357, + "step": 5648, + "task_loss": 2.7158405780792236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995704779034879, + "compression/movement_sparsity/importance_threshold": -3.008258863125904e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9228030937396021, + "compression/movement_sparsity/model_sparsity": 0.8911019685952781, + "compression_loss": 105.6065673828125, + "distillation_loss": 4.858945846557617, + "epoch": 4.77, + "learning_rate": 2.9026955950032874e-05, + "loss": 110.0395, + "step": 5649, + "task_loss": 2.619891881942749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995752680683632, + "compression/movement_sparsity/importance_threshold": -2.9747098185973347e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9228590061616467, + "compression/movement_sparsity/model_sparsity": 0.8911559602536187, + "compression_loss": 105.60639190673828, + "distillation_loss": 3.7543773651123047, + "epoch": 4.78, + "learning_rate": 2.9022259791490564e-05, + "loss": 108.954, + "step": 5650, + "task_loss": 1.553162932395935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995800224859058, + "compression/movement_sparsity/importance_threshold": -2.941411138907002e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9228392239675388, + "compression/movement_sparsity/model_sparsity": 0.8911368576387355, + "compression_loss": 105.60621643066406, + "distillation_loss": 3.602078914642334, + "epoch": 4.78, + "learning_rate": 2.9017563632948247e-05, + "loss": 109.0845, + "step": 5651, + "task_loss": 1.0530041456222534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995847412900012, + "compression/movement_sparsity/importance_threshold": -2.9083618863570693e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9228570267498192, + "compression/movement_sparsity/model_sparsity": 0.8911540488406768, + "compression_loss": 105.60604095458984, + "distillation_loss": 5.1785173416137695, + "epoch": 4.78, + "learning_rate": 2.9012867474405936e-05, + "loss": 109.4983, + "step": 5652, + "task_loss": 2.988839626312256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995894246145341, + "compression/movement_sparsity/importance_threshold": -2.8755611232540376e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9228679731357089, + "compression/movement_sparsity/model_sparsity": 0.8911646191845362, + "compression_loss": 105.60591125488281, + "distillation_loss": 3.4887447357177734, + "epoch": 4.78, + "learning_rate": 2.9008171315863626e-05, + "loss": 109.1467, + "step": 5653, + "task_loss": 2.938291549682617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9995940725933901, + "compression/movement_sparsity/importance_threshold": -2.843007911900071e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9228560847405759, + "compression/movement_sparsity/model_sparsity": 0.8911531391923491, + "compression_loss": 105.6057357788086, + "distillation_loss": 4.141871452331543, + "epoch": 4.78, + "learning_rate": 2.9003475157321312e-05, + "loss": 109.6863, + "step": 5654, + "task_loss": 1.7518213987350464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999598685360454, + "compression/movement_sparsity/importance_threshold": -2.810701314599935e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9229157890479287, + "compression/movement_sparsity/model_sparsity": 0.8912107924730723, + "compression_loss": 105.60563659667969, + "distillation_loss": 5.261333465576172, + "epoch": 4.78, + "learning_rate": 2.8998778998779002e-05, + "loss": 110.303, + "step": 5655, + "task_loss": 2.5550904273986816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996032630496111, + "compression/movement_sparsity/importance_threshold": -2.7786403936566617e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9229129153235284, + "compression/movement_sparsity/model_sparsity": 0.8912080174699458, + "compression_loss": 105.60546875, + "distillation_loss": 4.363441467285156, + "epoch": 4.78, + "learning_rate": 2.8994082840236685e-05, + "loss": 109.4663, + "step": 5656, + "task_loss": 2.0265538692474365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996078057947465, + "compression/movement_sparsity/importance_threshold": -2.746824211375884e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.922955663464503, + "compression/movement_sparsity/model_sparsity": 0.8912492970807693, + "compression_loss": 105.6053237915039, + "distillation_loss": 3.6328392028808594, + "epoch": 4.78, + "learning_rate": 2.8989386681694375e-05, + "loss": 109.3796, + "step": 5657, + "task_loss": 2.4226694107055664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996123137297453, + "compression/movement_sparsity/importance_threshold": -2.715251830059766e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9229369305971471, + "compression/movement_sparsity/model_sparsity": 0.8912312077450361, + "compression_loss": 105.60517883300781, + "distillation_loss": 3.928009033203125, + "epoch": 4.78, + "learning_rate": 2.8984690523152065e-05, + "loss": 109.6372, + "step": 5658, + "task_loss": 2.5767695903778076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996167869884928, + "compression/movement_sparsity/importance_threshold": -2.6839223120139408e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9229736570334655, + "compression/movement_sparsity/model_sparsity": 0.8912666725152832, + "compression_loss": 105.60507202148438, + "distillation_loss": 4.8014140129089355, + "epoch": 4.78, + "learning_rate": 2.8979994364609754e-05, + "loss": 109.2881, + "step": 5659, + "task_loss": 2.319236993789673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996212257048739, + "compression/movement_sparsity/importance_threshold": -2.6528347195405727e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9230042067509486, + "compression/movement_sparsity/model_sparsity": 0.8912961727559888, + "compression_loss": 105.60497283935547, + "distillation_loss": 3.252730131149292, + "epoch": 4.78, + "learning_rate": 2.8975298206067437e-05, + "loss": 108.8667, + "step": 5660, + "task_loss": 1.0513325929641724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996256300127739, + "compression/movement_sparsity/importance_threshold": -2.621988114945295e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9230470383610966, + "compression/movement_sparsity/model_sparsity": 0.8913375329685628, + "compression_loss": 105.60482788085938, + "distillation_loss": 3.057798385620117, + "epoch": 4.78, + "learning_rate": 2.8970602047525124e-05, + "loss": 109.0069, + "step": 5661, + "task_loss": 1.599597454071045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996300000460779, + "compression/movement_sparsity/importance_threshold": -2.591381560531139e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9230470979819347, + "compression/movement_sparsity/model_sparsity": 0.8913375905412418, + "compression_loss": 105.60467529296875, + "distillation_loss": 4.240011215209961, + "epoch": 4.79, + "learning_rate": 2.8965905888982813e-05, + "loss": 109.9488, + "step": 5662, + "task_loss": 1.789961814880371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999634335938671, + "compression/movement_sparsity/importance_threshold": -2.561014118602871e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9230878905594171, + "compression/movement_sparsity/model_sparsity": 0.8913769817681949, + "compression_loss": 105.60453796386719, + "distillation_loss": 4.366610050201416, + "epoch": 4.79, + "learning_rate": 2.8961209730440503e-05, + "loss": 109.4668, + "step": 5663, + "task_loss": 1.924889087677002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996386378244385, + "compression/movement_sparsity/importance_threshold": -2.530884851463522e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9230724010656581, + "compression/movement_sparsity/model_sparsity": 0.8913620243861978, + "compression_loss": 105.60440826416016, + "distillation_loss": 4.690059661865234, + "epoch": 4.79, + "learning_rate": 2.8956513571898186e-05, + "loss": 110.0697, + "step": 5664, + "task_loss": 2.761455774307251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996429058372653, + "compression/movement_sparsity/importance_threshold": -2.500992821417858e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9230787327986727, + "compression/movement_sparsity/model_sparsity": 0.8913681386047048, + "compression_loss": 105.60428619384766, + "distillation_loss": 2.7741312980651855, + "epoch": 4.79, + "learning_rate": 2.8951817413355876e-05, + "loss": 109.5664, + "step": 5665, + "task_loss": 1.9377176761627197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996471401110367, + "compression/movement_sparsity/importance_threshold": -2.4713370907689103e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9230731642123868, + "compression/movement_sparsity/model_sparsity": 0.8913627613164886, + "compression_loss": 105.60414123535156, + "distillation_loss": 4.983858585357666, + "epoch": 4.79, + "learning_rate": 2.8947121254813565e-05, + "loss": 109.6826, + "step": 5666, + "task_loss": 2.3862502574920654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996513407796378, + "compression/movement_sparsity/importance_threshold": -2.4419167218223126e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9231099741178787, + "compression/movement_sparsity/model_sparsity": 0.8913983066884864, + "compression_loss": 105.60397338867188, + "distillation_loss": 3.4344563484191895, + "epoch": 4.79, + "learning_rate": 2.8942425096271252e-05, + "loss": 109.6671, + "step": 5667, + "task_loss": 1.5471762418746948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996555079769538, + "compression/movement_sparsity/importance_threshold": -2.4127307768802286e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.923103546991523, + "compression/movement_sparsity/model_sparsity": 0.8913921003536931, + "compression_loss": 105.60382843017578, + "distillation_loss": 5.558966636657715, + "epoch": 4.79, + "learning_rate": 2.893772893772894e-05, + "loss": 109.8489, + "step": 5668, + "task_loss": 2.1992363929748535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996596418368697, + "compression/movement_sparsity/importance_threshold": -2.3837783182474245e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9231275861134769, + "compression/movement_sparsity/model_sparsity": 0.891415313657855, + "compression_loss": 105.60367584228516, + "distillation_loss": 4.293231010437012, + "epoch": 4.79, + "learning_rate": 2.8933032779186624e-05, + "loss": 109.7502, + "step": 5669, + "task_loss": 2.4795405864715576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996637424932708, + "compression/movement_sparsity/importance_threshold": -2.355058408227799e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.923124175801533, + "compression/movement_sparsity/model_sparsity": 0.8914120205006177, + "compression_loss": 105.60350799560547, + "distillation_loss": 4.44014310836792, + "epoch": 4.79, + "learning_rate": 2.8928336620644314e-05, + "loss": 109.7427, + "step": 5670, + "task_loss": 2.3470048904418945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996678100800421, + "compression/movement_sparsity/importance_threshold": -2.326570109126118e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9230924694397893, + "compression/movement_sparsity/model_sparsity": 0.89138140334994, + "compression_loss": 105.60332489013672, + "distillation_loss": 4.180745601654053, + "epoch": 4.79, + "learning_rate": 2.8923640462102004e-05, + "loss": 109.6855, + "step": 5671, + "task_loss": 3.089413642883301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996718447310688, + "compression/movement_sparsity/importance_threshold": -2.2983124832454127e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9231926205237628, + "compression/movement_sparsity/model_sparsity": 0.8914781139360783, + "compression_loss": 105.60311126708984, + "distillation_loss": 2.157921075820923, + "epoch": 4.79, + "learning_rate": 2.891894430355969e-05, + "loss": 109.0114, + "step": 5672, + "task_loss": 2.1778717041015625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996758465802362, + "compression/movement_sparsity/importance_threshold": -2.2702845928887147e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9232463985198006, + "compression/movement_sparsity/model_sparsity": 0.8915300444925117, + "compression_loss": 105.6029052734375, + "distillation_loss": 3.26692533493042, + "epoch": 4.79, + "learning_rate": 2.8914248145017377e-05, + "loss": 109.3564, + "step": 5673, + "task_loss": 2.509411334991455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996798157614292, + "compression/movement_sparsity/importance_threshold": -2.2424855003625246e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9232135355137961, + "compression/movement_sparsity/model_sparsity": 0.891498310431862, + "compression_loss": 105.6026840209961, + "distillation_loss": 4.3126020431518555, + "epoch": 4.8, + "learning_rate": 2.8909551986475063e-05, + "loss": 109.4038, + "step": 5674, + "task_loss": 1.9650356769561768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999683752408533, + "compression/movement_sparsity/importance_threshold": -2.2149142679690065e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9231845717106086, + "compression/movement_sparsity/model_sparsity": 0.8914703416244171, + "compression_loss": 105.6025161743164, + "distillation_loss": 4.245550632476807, + "epoch": 4.8, + "learning_rate": 2.8904855827932753e-05, + "loss": 109.6315, + "step": 5675, + "task_loss": 3.0455315113067627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996876566554328, + "compression/movement_sparsity/importance_threshold": -2.1875699580129263e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9232408657060176, + "compression/movement_sparsity/model_sparsity": 0.8915247017479031, + "compression_loss": 105.60231018066406, + "distillation_loss": 3.9444079399108887, + "epoch": 4.8, + "learning_rate": 2.8900159669390442e-05, + "loss": 110.2266, + "step": 5676, + "task_loss": 1.301820993423462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996915286360136, + "compression/movement_sparsity/importance_threshold": -2.1604516327981826e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9232746707212652, + "compression/movement_sparsity/model_sparsity": 0.8915573454568806, + "compression_loss": 105.60211181640625, + "distillation_loss": 4.091458797454834, + "epoch": 4.8, + "learning_rate": 2.8895463510848125e-05, + "loss": 109.4986, + "step": 5677, + "task_loss": 2.6463732719421387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996953684841609, + "compression/movement_sparsity/importance_threshold": -2.133558354627807e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9232628061744675, + "compression/movement_sparsity/model_sparsity": 0.891545888493765, + "compression_loss": 105.60187530517578, + "distillation_loss": 3.783182144165039, + "epoch": 4.8, + "learning_rate": 2.8890767352305815e-05, + "loss": 109.7105, + "step": 5678, + "task_loss": 2.8817298412323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9996991763337594, + "compression/movement_sparsity/importance_threshold": -2.1068891858065653e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9232313382960765, + "compression/movement_sparsity/model_sparsity": 0.8915155016338032, + "compression_loss": 105.60169982910156, + "distillation_loss": 3.3853981494903564, + "epoch": 4.8, + "learning_rate": 2.8886071193763505e-05, + "loss": 108.9397, + "step": 5679, + "task_loss": 1.9737993478775024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997029523186944, + "compression/movement_sparsity/importance_threshold": -2.080443188638356e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9231854302506785, + "compression/movement_sparsity/model_sparsity": 0.8914711706709942, + "compression_loss": 105.60153198242188, + "distillation_loss": 4.332512855529785, + "epoch": 4.8, + "learning_rate": 2.888137503522119e-05, + "loss": 109.5464, + "step": 5680, + "task_loss": 2.8926968574523926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997066965728512, + "compression/movement_sparsity/importance_threshold": -2.054219425427946e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9231700480744282, + "compression/movement_sparsity/model_sparsity": 0.8914563169198193, + "compression_loss": 105.60138702392578, + "distillation_loss": 3.275078773498535, + "epoch": 4.8, + "learning_rate": 2.8876678876678874e-05, + "loss": 109.4918, + "step": 5681, + "task_loss": 2.2673392295837402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997104092301147, + "compression/movement_sparsity/importance_threshold": -2.0282169584783655e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9231723852112848, + "compression/movement_sparsity/model_sparsity": 0.8914585737688351, + "compression_loss": 105.60120391845703, + "distillation_loss": 5.186788082122803, + "epoch": 4.8, + "learning_rate": 2.8871982718136564e-05, + "loss": 109.7595, + "step": 5682, + "task_loss": 2.766563653945923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997140904243702, + "compression/movement_sparsity/importance_threshold": -2.0024348500926464e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9231985587592455, + "compression/movement_sparsity/model_sparsity": 0.891483848174904, + "compression_loss": 105.6009750366211, + "distillation_loss": 3.2706546783447266, + "epoch": 4.8, + "learning_rate": 2.8867286559594254e-05, + "loss": 109.1863, + "step": 5683, + "task_loss": 1.5364062786102295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997177402895029, + "compression/movement_sparsity/importance_threshold": -1.976872162576422e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9232445621979847, + "compression/movement_sparsity/model_sparsity": 0.8915282712539994, + "compression_loss": 105.60076141357422, + "distillation_loss": 5.436102867126465, + "epoch": 4.8, + "learning_rate": 2.8862590401051943e-05, + "loss": 109.3955, + "step": 5684, + "task_loss": 2.8402621746063232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997213589593976, + "compression/movement_sparsity/importance_threshold": -1.951527958233591e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.923241473838567, + "compression/movement_sparsity/model_sparsity": 0.8915252889892286, + "compression_loss": 105.6005630493164, + "distillation_loss": 4.29775857925415, + "epoch": 4.81, + "learning_rate": 2.885789424250963e-05, + "loss": 109.7468, + "step": 5685, + "task_loss": 2.6675591468811035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997249465679399, + "compression/movement_sparsity/importance_threshold": -1.926401299367185e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233000095774915, + "compression/movement_sparsity/model_sparsity": 0.891581813845444, + "compression_loss": 105.60035705566406, + "distillation_loss": 4.684015274047852, + "epoch": 4.81, + "learning_rate": 2.8853198083967316e-05, + "loss": 109.2161, + "step": 5686, + "task_loss": 2.2515711784362793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997285032490146, + "compression/movement_sparsity/importance_threshold": -1.9014912482819696e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233031217852444, + "compression/movement_sparsity/model_sparsity": 0.8915848191392863, + "compression_loss": 105.60009765625, + "distillation_loss": 4.124024868011475, + "epoch": 4.81, + "learning_rate": 2.8848501925425002e-05, + "loss": 109.3567, + "step": 5687, + "task_loss": 2.2457985877990723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999732029136507, + "compression/movement_sparsity/importance_threshold": -1.8767968672818436e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233214969275713, + "compression/movement_sparsity/model_sparsity": 0.8916025630389458, + "compression_loss": 105.59989929199219, + "distillation_loss": 5.2724103927612305, + "epoch": 4.81, + "learning_rate": 2.8843805766882692e-05, + "loss": 109.9345, + "step": 5688, + "task_loss": 2.300959348678589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997355243643022, + "compression/movement_sparsity/importance_threshold": -1.8523172186698383e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233385365631229, + "compression/movement_sparsity/model_sparsity": 0.8916190173105961, + "compression_loss": 105.5996322631836, + "distillation_loss": 4.746391296386719, + "epoch": 4.81, + "learning_rate": 2.8839109608340382e-05, + "loss": 109.7073, + "step": 5689, + "task_loss": 4.34242582321167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997389890662853, + "compression/movement_sparsity/importance_threshold": -1.8280513647515872e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233359251704106, + "compression/movement_sparsity/model_sparsity": 0.8916164956272572, + "compression_loss": 105.59937286376953, + "distillation_loss": 2.142906427383423, + "epoch": 4.81, + "learning_rate": 2.8834413449798065e-05, + "loss": 108.9779, + "step": 5690, + "task_loss": 1.1718320846557617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997424233763416, + "compression/movement_sparsity/importance_threshold": -1.8039983678292543e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233293549540433, + "compression/movement_sparsity/model_sparsity": 0.8916101511180343, + "compression_loss": 105.59909057617188, + "distillation_loss": 2.425784111022949, + "epoch": 4.81, + "learning_rate": 2.8829717291255754e-05, + "loss": 108.8178, + "step": 5691, + "task_loss": 1.6886776685714722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997458274283559, + "compression/movement_sparsity/importance_threshold": -1.7801572902084728e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233323002234494, + "compression/movement_sparsity/model_sparsity": 0.8916129952083757, + "compression_loss": 105.59879302978516, + "distillation_loss": 4.4191155433654785, + "epoch": 4.81, + "learning_rate": 2.882502113271344e-05, + "loss": 110.2999, + "step": 5692, + "task_loss": 3.944244861602783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997492013562138, + "compression/movement_sparsity/importance_threshold": -1.7565271941914068e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233687404797446, + "compression/movement_sparsity/model_sparsity": 0.8916481836297637, + "compression_loss": 105.59849548339844, + "distillation_loss": 4.312252044677734, + "epoch": 4.81, + "learning_rate": 2.882032497417113e-05, + "loss": 109.5661, + "step": 5693, + "task_loss": 2.179814100265503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997525452938001, + "compression/movement_sparsity/importance_threshold": -1.7331071420836897e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.92338742565043, + "compression/movement_sparsity/model_sparsity": 0.8916662269073538, + "compression_loss": 105.59819793701172, + "distillation_loss": 3.828185796737671, + "epoch": 4.81, + "learning_rate": 2.8815628815628813e-05, + "loss": 109.5124, + "step": 5694, + "task_loss": 2.3063619136810303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999755859375, + "compression/movement_sparsity/importance_threshold": -1.70989619618922e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233983124154815, + "compression/movement_sparsity/model_sparsity": 0.8916767396785342, + "compression_loss": 105.59794616699219, + "distillation_loss": 4.6303181648254395, + "epoch": 4.81, + "learning_rate": 2.8810932657086503e-05, + "loss": 109.4474, + "step": 5695, + "task_loss": 2.97469162940979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997591437336987, + "compression/movement_sparsity/importance_threshold": -1.6868934188110293e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233784348280325, + "compression/movement_sparsity/model_sparsity": 0.8916575449473647, + "compression_loss": 105.5976333618164, + "distillation_loss": 3.579273223876953, + "epoch": 4.81, + "learning_rate": 2.8806236498544193e-05, + "loss": 108.9941, + "step": 5696, + "task_loss": 1.6963462829589844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997623985037813, + "compression/movement_sparsity/importance_threshold": -1.6640978722538835e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233599762165322, + "compression/movement_sparsity/model_sparsity": 0.8916397204459547, + "compression_loss": 105.5972900390625, + "distillation_loss": 3.6290738582611084, + "epoch": 4.82, + "learning_rate": 2.8801540340001883e-05, + "loss": 109.1993, + "step": 5697, + "task_loss": 2.6862573623657227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999765623819133, + "compression/movement_sparsity/importance_threshold": -1.6415086188208139e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233822982583465, + "compression/movement_sparsity/model_sparsity": 0.8916612756569622, + "compression_loss": 105.59703826904297, + "distillation_loss": 3.9492249488830566, + "epoch": 4.82, + "learning_rate": 2.879684418145957e-05, + "loss": 109.9019, + "step": 5698, + "task_loss": 2.2623655796051025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999768819813639, + "compression/movement_sparsity/importance_threshold": -1.6191247208165865e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.92339423435015, + "compression/movement_sparsity/model_sparsity": 0.8916728017072925, + "compression_loss": 105.59677124023438, + "distillation_loss": 4.307875633239746, + "epoch": 4.82, + "learning_rate": 2.8792148022917252e-05, + "loss": 109.6528, + "step": 5699, + "task_loss": 2.394026517868042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997719866211843, + "compression/movement_sparsity/importance_threshold": -1.5969452405442328e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233531675168121, + "compression/movement_sparsity/model_sparsity": 0.891633145646016, + "compression_loss": 105.59647369384766, + "distillation_loss": 3.448768138885498, + "epoch": 4.82, + "learning_rate": 2.878745186437494e-05, + "loss": 109.0861, + "step": 5700, + "task_loss": 1.5466502904891968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999775124375654, + "compression/movement_sparsity/importance_threshold": -1.574969240309386e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233958560369485, + "compression/movement_sparsity/model_sparsity": 0.8916743676841605, + "compression_loss": 105.59616088867188, + "distillation_loss": 5.1056108474731445, + "epoch": 4.82, + "learning_rate": 2.878275570583263e-05, + "loss": 109.8351, + "step": 5701, + "task_loss": 1.883887529373169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997782332109333, + "compression/movement_sparsity/importance_threshold": -1.5531957824150774e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234229358216496, + "compression/movement_sparsity/model_sparsity": 0.8917005171949499, + "compression_loss": 105.5959243774414, + "distillation_loss": 4.011333465576172, + "epoch": 4.82, + "learning_rate": 2.877805954729032e-05, + "loss": 109.2304, + "step": 5702, + "task_loss": 1.9362305402755737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997813132609075, + "compression/movement_sparsity/importance_threshold": -1.531623929165206e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233343988769532, + "compression/movement_sparsity/model_sparsity": 0.8916150217666755, + "compression_loss": 105.5956039428711, + "distillation_loss": 2.5746169090270996, + "epoch": 4.82, + "learning_rate": 2.8773363388748004e-05, + "loss": 108.899, + "step": 5703, + "task_loss": 2.109720468521118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997843646594615, + "compression/movement_sparsity/importance_threshold": -1.51025274286367e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233859351294754, + "compression/movement_sparsity/model_sparsity": 0.8916647875903795, + "compression_loss": 105.59537506103516, + "distillation_loss": 4.674323558807373, + "epoch": 4.82, + "learning_rate": 2.8768667230205694e-05, + "loss": 109.8116, + "step": 5704, + "task_loss": 2.632923126220703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997873875404806, + "compression/movement_sparsity/importance_threshold": -1.4890812858152358e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233510330908054, + "compression/movement_sparsity/model_sparsity": 0.8916310845441089, + "compression_loss": 105.59503173828125, + "distillation_loss": 3.91969633102417, + "epoch": 4.82, + "learning_rate": 2.876397107166338e-05, + "loss": 109.6389, + "step": 5705, + "task_loss": 2.8761422634124756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997903820378499, + "compression/movement_sparsity/importance_threshold": -1.4681086203229346e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233411479558352, + "compression/movement_sparsity/model_sparsity": 0.8916215389939351, + "compression_loss": 105.5947265625, + "distillation_loss": 3.846813201904297, + "epoch": 4.82, + "learning_rate": 2.875927491312107e-05, + "loss": 109.6477, + "step": 5706, + "task_loss": 2.1879689693450928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997933482854545, + "compression/movement_sparsity/importance_threshold": -1.447333808690665e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9233681919680333, + "compression/movement_sparsity/model_sparsity": 0.8916476539611172, + "compression_loss": 105.59439086914062, + "distillation_loss": 4.152050018310547, + "epoch": 4.82, + "learning_rate": 2.8754578754578753e-05, + "loss": 109.2326, + "step": 5707, + "task_loss": 2.0574676990509033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997962864171795, + "compression/movement_sparsity/importance_threshold": -1.4267559132231933e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234565619743826, + "compression/movement_sparsity/model_sparsity": 0.8917329881858905, + "compression_loss": 105.59405517578125, + "distillation_loss": 4.01668643951416, + "epoch": 4.82, + "learning_rate": 2.8749882596036443e-05, + "loss": 109.5796, + "step": 5708, + "task_loss": 2.178811550140381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9997991965669102, + "compression/movement_sparsity/importance_threshold": -1.4063739962244179e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234674964361048, + "compression/movement_sparsity/model_sparsity": 0.8917435470152141, + "compression_loss": 105.59379577636719, + "distillation_loss": 3.8174896240234375, + "epoch": 4.83, + "learning_rate": 2.8745186437494132e-05, + "loss": 109.4665, + "step": 5709, + "task_loss": 1.850546956062317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998020788685317, + "compression/movement_sparsity/importance_threshold": -1.3861871199973702e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.923479539845417, + "compression/movement_sparsity/model_sparsity": 0.8917551766963666, + "compression_loss": 105.59341430664062, + "distillation_loss": 3.6967933177948, + "epoch": 4.83, + "learning_rate": 2.8740490278951822e-05, + "loss": 109.4936, + "step": 5710, + "task_loss": 2.026614189147949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999804933455929, + "compression/movement_sparsity/importance_threshold": -1.3661943468468163e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234189531496592, + "compression/movement_sparsity/model_sparsity": 0.8916966713399945, + "compression_loss": 105.5931396484375, + "distillation_loss": 4.380904674530029, + "epoch": 4.83, + "learning_rate": 2.8735794120409505e-05, + "loss": 110.4106, + "step": 5711, + "task_loss": 2.534013032913208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998077604629874, + "compression/movement_sparsity/importance_threshold": -1.3463947390775222e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234079948396018, + "compression/movement_sparsity/model_sparsity": 0.8916860894815993, + "compression_loss": 105.5927963256836, + "distillation_loss": 4.466061592102051, + "epoch": 4.83, + "learning_rate": 2.873109796186719e-05, + "loss": 109.6774, + "step": 5712, + "task_loss": 2.956721067428589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998105600235919, + "compression/movement_sparsity/importance_threshold": -1.3267873589916518e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234230312149907, + "compression/movement_sparsity/model_sparsity": 0.8917006093112363, + "compression_loss": 105.59247589111328, + "distillation_loss": 3.1775381565093994, + "epoch": 4.83, + "learning_rate": 2.872640180332488e-05, + "loss": 109.0873, + "step": 5713, + "task_loss": 2.629027843475342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998133322716277, + "compression/movement_sparsity/importance_threshold": -1.3073712688948386e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234189769979945, + "compression/movement_sparsity/model_sparsity": 0.8916966943690662, + "compression_loss": 105.59213256835938, + "distillation_loss": 4.9218292236328125, + "epoch": 4.83, + "learning_rate": 2.872170564478257e-05, + "loss": 109.573, + "step": 5714, + "task_loss": 3.185810089111328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.99981607734098, + "compression/movement_sparsity/importance_threshold": -1.2881455310901138e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234805176271631, + "compression/movement_sparsity/model_sparsity": 0.8917561208883018, + "compression_loss": 105.59181213378906, + "distillation_loss": 4.257322311401367, + "epoch": 4.83, + "learning_rate": 2.871700948624026e-05, + "loss": 109.1893, + "step": 5715, + "task_loss": 2.3104312419891357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999818795365534, + "compression/movement_sparsity/importance_threshold": -1.2691092078813762e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234896396154045, + "compression/movement_sparsity/model_sparsity": 0.8917649295081845, + "compression_loss": 105.59149932861328, + "distillation_loss": 4.190281867980957, + "epoch": 4.83, + "learning_rate": 2.8712313327697943e-05, + "loss": 109.5848, + "step": 5716, + "task_loss": 2.6030220985412598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998214864791746, + "compression/movement_sparsity/importance_threshold": -1.2502613615725244e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.923462512134033, + "compression/movement_sparsity/model_sparsity": 0.891738733939252, + "compression_loss": 105.59109497070312, + "distillation_loss": 3.924710273742676, + "epoch": 4.83, + "learning_rate": 2.8707617169155633e-05, + "loss": 110.1064, + "step": 5717, + "task_loss": 2.252213716506958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998241508157871, + "compression/movement_sparsity/importance_threshold": -1.2316010544683245e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9234779420069538, + "compression/movement_sparsity/model_sparsity": 0.8917536337485701, + "compression_loss": 105.5907974243164, + "distillation_loss": 4.670319557189941, + "epoch": 4.83, + "learning_rate": 2.870292101061332e-05, + "loss": 109.066, + "step": 5718, + "task_loss": 2.2773499488830566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998267885092567, + "compression/movement_sparsity/importance_threshold": -1.2131273488726751e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9235022315364281, + "compression/movement_sparsity/model_sparsity": 0.8917770888579836, + "compression_loss": 105.59042358398438, + "distillation_loss": 4.5051374435424805, + "epoch": 4.83, + "learning_rate": 2.869822485207101e-05, + "loss": 108.9894, + "step": 5719, + "task_loss": 2.8995678424835205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998293996934683, + "compression/movement_sparsity/importance_threshold": -1.1948393070886076e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9235205709062521, + "compression/movement_sparsity/model_sparsity": 0.8917947982140356, + "compression_loss": 105.59002685546875, + "distillation_loss": 4.603326320648193, + "epoch": 4.83, + "learning_rate": 2.8693528693528692e-05, + "loss": 110.2865, + "step": 5720, + "task_loss": 2.6093969345092773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998319845023073, + "compression/movement_sparsity/importance_threshold": -1.1767359914217554e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9235696388560736, + "compression/movement_sparsity/model_sparsity": 0.8918421805288301, + "compression_loss": 105.5896987915039, + "distillation_loss": 2.309217929840088, + "epoch": 4.84, + "learning_rate": 2.8688832534986382e-05, + "loss": 108.5457, + "step": 5721, + "task_loss": 1.401188850402832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998345430696587, + "compression/movement_sparsity/importance_threshold": -1.1588164641742824e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9235923305470847, + "compression/movement_sparsity/model_sparsity": 0.8918640926904472, + "compression_loss": 105.58939361572266, + "distillation_loss": 3.5453619956970215, + "epoch": 4.84, + "learning_rate": 2.868413637644407e-05, + "loss": 109.3182, + "step": 5722, + "task_loss": 1.51968514919281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998370755294077, + "compression/movement_sparsity/importance_threshold": -1.1410797876509546e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236255035814476, + "compression/movement_sparsity/model_sparsity": 0.8918961261290276, + "compression_loss": 105.58903503417969, + "distillation_loss": 4.044450759887695, + "epoch": 4.84, + "learning_rate": 2.8679440217901758e-05, + "loss": 108.8695, + "step": 5723, + "task_loss": 1.8900260925292969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998395820154394, + "compression/movement_sparsity/importance_threshold": -1.1235250241556707e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236886658974147, + "compression/movement_sparsity/model_sparsity": 0.8919571186251313, + "compression_loss": 105.58867645263672, + "distillation_loss": 4.239625453948975, + "epoch": 4.84, + "learning_rate": 2.8674744059359444e-05, + "loss": 110.0528, + "step": 5724, + "task_loss": 2.970317840576172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999842062661639, + "compression/movement_sparsity/importance_threshold": -1.1061512359923295e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237127407918716, + "compression/movement_sparsity/model_sparsity": 0.8919803664729005, + "compression_loss": 105.58840942382812, + "distillation_loss": 2.983698844909668, + "epoch": 4.84, + "learning_rate": 2.867004790081713e-05, + "loss": 108.9478, + "step": 5725, + "task_loss": 1.7866815328598022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998445176018915, + "compression/movement_sparsity/importance_threshold": -1.088957485465697e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237099624608124, + "compression/movement_sparsity/model_sparsity": 0.8919776835860603, + "compression_loss": 105.58805084228516, + "distillation_loss": 4.215619087219238, + "epoch": 4.84, + "learning_rate": 2.866535174227482e-05, + "loss": 109.5378, + "step": 5726, + "task_loss": 2.1453146934509277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998469469700823, + "compression/movement_sparsity/importance_threshold": -1.0719428348788043e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236950453271, + "compression/movement_sparsity/model_sparsity": 0.8919632789017814, + "compression_loss": 105.5876693725586, + "distillation_loss": 3.587468385696411, + "epoch": 4.84, + "learning_rate": 2.866065558373251e-05, + "loss": 109.0754, + "step": 5727, + "task_loss": 1.9469712972640991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998493509000963, + "compression/movement_sparsity/importance_threshold": -1.0551063465355504e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236696945467061, + "compression/movement_sparsity/model_sparsity": 0.8919387989986821, + "compression_loss": 105.58728790283203, + "distillation_loss": 3.4647626876831055, + "epoch": 4.84, + "learning_rate": 2.8655959425190193e-05, + "loss": 108.9996, + "step": 5728, + "task_loss": 3.018655300140381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998517295258187, + "compression/movement_sparsity/importance_threshold": -1.038447082740701e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236950334029324, + "compression/movement_sparsity/model_sparsity": 0.8919632673872455, + "compression_loss": 105.58688354492188, + "distillation_loss": 3.6663095951080322, + "epoch": 4.84, + "learning_rate": 2.8651263266647883e-05, + "loss": 109.9913, + "step": 5729, + "task_loss": 1.7663112878799438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998540829811348, + "compression/movement_sparsity/importance_threshold": -1.0219641057972878e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236831927044699, + "compression/movement_sparsity/model_sparsity": 0.8919518334532016, + "compression_loss": 105.5865249633789, + "distillation_loss": 6.020182132720947, + "epoch": 4.84, + "learning_rate": 2.8646567108105572e-05, + "loss": 109.551, + "step": 5730, + "task_loss": 3.4187960624694824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998564113999294, + "compression/movement_sparsity/importance_threshold": -1.005656478010944e-06, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236237268804699, + "compression/movement_sparsity/model_sparsity": 0.8918944104631942, + "compression_loss": 105.58613586425781, + "distillation_loss": 3.071834087371826, + "epoch": 4.84, + "learning_rate": 2.864187094956326e-05, + "loss": 109.2762, + "step": 5731, + "task_loss": 2.4498469829559326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998587149160879, + "compression/movement_sparsity/importance_threshold": -9.895232616838334e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236344467071745, + "compression/movement_sparsity/model_sparsity": 0.8919047620308734, + "compression_loss": 105.58580780029297, + "distillation_loss": 4.7092390060424805, + "epoch": 4.84, + "learning_rate": 2.863717479102095e-05, + "loss": 109.4559, + "step": 5732, + "task_loss": 2.411214828491211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998609936634955, + "compression/movement_sparsity/importance_threshold": -9.735635191207223e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236509855276854, + "compression/movement_sparsity/model_sparsity": 0.8919207326920205, + "compression_loss": 105.58537292480469, + "distillation_loss": 4.833512306213379, + "epoch": 4.85, + "learning_rate": 2.863247863247863e-05, + "loss": 109.6277, + "step": 5733, + "task_loss": 3.463469982147217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998632477760372, + "compression/movement_sparsity/importance_threshold": -9.57776312624642e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236685498266131, + "compression/movement_sparsity/model_sparsity": 0.8919376936032459, + "compression_loss": 105.58502197265625, + "distillation_loss": 3.6072967052459717, + "epoch": 4.85, + "learning_rate": 2.862778247393632e-05, + "loss": 109.213, + "step": 5734, + "task_loss": 3.7080564498901367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999865477387598, + "compression/movement_sparsity/importance_threshold": -9.42160704502093e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236592370516895, + "compression/movement_sparsity/model_sparsity": 0.8919287007507903, + "compression_loss": 105.58460998535156, + "distillation_loss": 4.377493858337402, + "epoch": 4.85, + "learning_rate": 2.862308631539401e-05, + "loss": 110.095, + "step": 5735, + "task_loss": 3.240359306335449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998676826320634, + "compression/movement_sparsity/importance_threshold": -9.267157570543721e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9236830376902906, + "compression/movement_sparsity/model_sparsity": 0.8919516837642363, + "compression_loss": 105.58422088623047, + "distillation_loss": 4.202365875244141, + "epoch": 4.85, + "learning_rate": 2.8618390156851697e-05, + "loss": 109.624, + "step": 5736, + "task_loss": 1.9415459632873535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998698636433183, + "compression/movement_sparsity/importance_threshold": -9.114405325862454e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237067190872155, + "compression/movement_sparsity/model_sparsity": 0.8919745516323242, + "compression_loss": 105.58385467529297, + "distillation_loss": 4.08169412612915, + "epoch": 4.85, + "learning_rate": 2.8613693998309384e-05, + "loss": 109.8512, + "step": 5737, + "task_loss": 2.1521048545837402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998720205552478, + "compression/movement_sparsity/importance_threshold": -8.963340934024788e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237517447442085, + "compression/movement_sparsity/model_sparsity": 0.8920180305194844, + "compression_loss": 105.58351135253906, + "distillation_loss": 5.323280334472656, + "epoch": 4.85, + "learning_rate": 2.860899783976707e-05, + "loss": 109.881, + "step": 5738, + "task_loss": 3.480252742767334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998741535017373, + "compression/movement_sparsity/importance_threshold": -8.813955018061037e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238464345594048, + "compression/movement_sparsity/model_sparsity": 0.8921094674482288, + "compression_loss": 105.58313751220703, + "distillation_loss": 4.512420654296875, + "epoch": 4.85, + "learning_rate": 2.860430168122476e-05, + "loss": 109.4066, + "step": 5739, + "task_loss": 1.8507853746414185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998762626166716, + "compression/movement_sparsity/importance_threshold": -8.666238201018861e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237801481115171, + "compression/movement_sparsity/model_sparsity": 0.892045458143747, + "compression_loss": 105.58281707763672, + "distillation_loss": 2.7434792518615723, + "epoch": 4.85, + "learning_rate": 2.859960552268245e-05, + "loss": 109.1152, + "step": 5740, + "task_loss": 2.4181931018829346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999878348033936, + "compression/movement_sparsity/importance_threshold": -8.520181105937247e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237454010870262, + "compression/movement_sparsity/model_sparsity": 0.8920119047864417, + "compression_loss": 105.5824203491211, + "distillation_loss": 4.774539470672607, + "epoch": 4.85, + "learning_rate": 2.8594909364140132e-05, + "loss": 109.4507, + "step": 5741, + "task_loss": 3.4477450847625732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998804098874158, + "compression/movement_sparsity/importance_threshold": -8.375774355846508e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237523648009256, + "compression/movement_sparsity/model_sparsity": 0.8920186292753457, + "compression_loss": 105.58198547363281, + "distillation_loss": 4.162593841552734, + "epoch": 4.85, + "learning_rate": 2.8590213205597822e-05, + "loss": 109.9438, + "step": 5742, + "task_loss": 2.7600302696228027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998824483109959, + "compression/movement_sparsity/importance_threshold": -8.233008573794304e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237970327328895, + "compression/movement_sparsity/model_sparsity": 0.892061762726432, + "compression_loss": 105.5815658569336, + "distillation_loss": 5.0269036293029785, + "epoch": 4.85, + "learning_rate": 2.858551704705551e-05, + "loss": 109.9769, + "step": 5743, + "task_loss": 1.986816644668579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998844634385615, + "compression/movement_sparsity/importance_threshold": -8.091874382828296e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238077048629236, + "compression/movement_sparsity/model_sparsity": 0.8920720682359682, + "compression_loss": 105.5811538696289, + "distillation_loss": 4.615762710571289, + "epoch": 4.85, + "learning_rate": 2.8580820888513198e-05, + "loss": 108.9336, + "step": 5744, + "task_loss": 2.7693445682525635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998864554039979, + "compression/movement_sparsity/importance_threshold": -7.95236240596145e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238302057672525, + "compression/movement_sparsity/model_sparsity": 0.8920937961650125, + "compression_loss": 105.58065032958984, + "distillation_loss": 3.2815237045288086, + "epoch": 4.86, + "learning_rate": 2.8576124729970888e-05, + "loss": 109.2264, + "step": 5745, + "task_loss": 2.0057530403137207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.99988842434119, + "compression/movement_sparsity/importance_threshold": -7.814463266267446e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238214295798725, + "compression/movement_sparsity/model_sparsity": 0.8920853214666676, + "compression_loss": 105.58023071289062, + "distillation_loss": 5.442292213439941, + "epoch": 4.86, + "learning_rate": 2.857142857142857e-05, + "loss": 109.7506, + "step": 5746, + "task_loss": 3.8159613609313965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998903703840231, + "compression/movement_sparsity/importance_threshold": -7.678167586759252e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238004430448333, + "compression/movement_sparsity/model_sparsity": 0.8920650558836692, + "compression_loss": 105.57976531982422, + "distillation_loss": 5.277232646942139, + "epoch": 4.86, + "learning_rate": 2.856673241288626e-05, + "loss": 109.6366, + "step": 5747, + "task_loss": 1.6122894287109375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998922936663822, + "compression/movement_sparsity/importance_threshold": -7.543465990493201e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.92378215137168, + "compression/movement_sparsity/model_sparsity": 0.8920473925857605, + "compression_loss": 105.57936096191406, + "distillation_loss": 4.456130504608154, + "epoch": 4.86, + "learning_rate": 2.856203625434395e-05, + "loss": 109.0319, + "step": 5748, + "task_loss": 3.589682102203369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998941943221527, + "compression/movement_sparsity/importance_threshold": -7.410349100499605e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237086746507077, + "compression/movement_sparsity/model_sparsity": 0.8919764400161945, + "compression_loss": 105.57891845703125, + "distillation_loss": 3.532195568084717, + "epoch": 4.86, + "learning_rate": 2.8557340095801637e-05, + "loss": 109.139, + "step": 5749, + "task_loss": 2.792116641998291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998960724852195, + "compression/movement_sparsity/importance_threshold": -7.278807539817453e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237567409704479, + "compression/movement_sparsity/model_sparsity": 0.8920228551099822, + "compression_loss": 105.57845306396484, + "distillation_loss": 5.264936447143555, + "epoch": 4.86, + "learning_rate": 2.855264393725932e-05, + "loss": 109.5717, + "step": 5750, + "task_loss": 2.6098172664642334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998979282894678, + "compression/movement_sparsity/importance_threshold": -7.148831931494404e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238211910965197, + "compression/movement_sparsity/model_sparsity": 0.8920850911759518, + "compression_loss": 105.5780029296875, + "distillation_loss": 4.24552059173584, + "epoch": 4.86, + "learning_rate": 2.854794777871701e-05, + "loss": 109.4594, + "step": 5751, + "task_loss": 2.437236785888672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9998997618687827, + "compression/movement_sparsity/importance_threshold": -7.020412898569445e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238234805367058, + "compression/movement_sparsity/model_sparsity": 0.8920873019668244, + "compression_loss": 105.5774917602539, + "distillation_loss": 3.8099443912506104, + "epoch": 4.86, + "learning_rate": 2.85432516201747e-05, + "loss": 109.8693, + "step": 5752, + "task_loss": 2.272874593734741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999015733570495, + "compression/movement_sparsity/importance_threshold": -6.893541064072889e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238787967503685, + "compression/movement_sparsity/model_sparsity": 0.8921407178983752, + "compression_loss": 105.57706451416016, + "distillation_loss": 5.645594596862793, + "epoch": 4.86, + "learning_rate": 2.853855546163239e-05, + "loss": 109.4767, + "step": 5753, + "task_loss": 2.3698890209198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999033628881532, + "compression/movement_sparsity/importance_threshold": -6.76820705106107e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238786059636864, + "compression/movement_sparsity/model_sparsity": 0.8921405336658025, + "compression_loss": 105.57659149169922, + "distillation_loss": 3.5185301303863525, + "epoch": 4.86, + "learning_rate": 2.8533859303090072e-05, + "loss": 108.9473, + "step": 5754, + "task_loss": 1.791491150856018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999905130595979, + "compression/movement_sparsity/importance_threshold": -6.644401482555629e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9239113139555116, + "compression/movement_sparsity/model_sparsity": 0.892172118037487, + "compression_loss": 105.576171875, + "distillation_loss": 4.260524749755859, + "epoch": 4.86, + "learning_rate": 2.852916314454776e-05, + "loss": 109.6997, + "step": 5755, + "task_loss": 3.1400325298309326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999906876614412, + "compression/movement_sparsity/importance_threshold": -6.522114981604224e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.923898888972835, + "compression/movement_sparsity/model_sparsity": 0.892160119891189, + "compression_loss": 105.5757064819336, + "distillation_loss": 4.541952610015869, + "epoch": 4.87, + "learning_rate": 2.8524466986005448e-05, + "loss": 109.1445, + "step": 5756, + "task_loss": 2.5373404026031494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999086010773374, + "compression/movement_sparsity/importance_threshold": -6.401338171254517e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238775566369345, + "compression/movement_sparsity/model_sparsity": 0.8921395203866526, + "compression_loss": 105.57528686523438, + "distillation_loss": 2.946751594543457, + "epoch": 4.87, + "learning_rate": 2.8519770827463138e-05, + "loss": 109.086, + "step": 5757, + "task_loss": 1.5726948976516724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999103041186402, + "compression/movement_sparsity/importance_threshold": -6.282061674536821e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238479847011976, + "compression/movement_sparsity/model_sparsity": 0.8921109643378822, + "compression_loss": 105.57479858398438, + "distillation_loss": 4.083705902099609, + "epoch": 4.87, + "learning_rate": 2.851507466892082e-05, + "loss": 109.5793, + "step": 5758, + "task_loss": 3.4338252544403076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999119858722058, + "compression/movement_sparsity/importance_threshold": -6.164276114481448e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9237985351780116, + "compression/movement_sparsity/model_sparsity": 0.8920632135579422, + "compression_loss": 105.57438659667969, + "distillation_loss": 4.285153388977051, + "epoch": 4.87, + "learning_rate": 2.851037851037851e-05, + "loss": 109.9572, + "step": 5759, + "task_loss": 3.281860828399658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999913646471919, + "compression/movement_sparsity/importance_threshold": -6.047972114153408e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238135954017357, + "compression/movement_sparsity/model_sparsity": 0.8920777564166507, + "compression_loss": 105.57390594482422, + "distillation_loss": 4.781952857971191, + "epoch": 4.87, + "learning_rate": 2.85056823518362e-05, + "loss": 109.593, + "step": 5760, + "task_loss": 2.2279229164123535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999152860516652, + "compression/movement_sparsity/importance_threshold": -5.933140296574338e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.923847150009463, + "compression/movement_sparsity/model_sparsity": 0.8921101583203765, + "compression_loss": 105.57349395751953, + "distillation_loss": 4.786239147186279, + "epoch": 4.87, + "learning_rate": 2.850098619329389e-05, + "loss": 110.3997, + "step": 5761, + "task_loss": 2.6992626190185547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999169047453296, + "compression/movement_sparsity/importance_threshold": -5.819771284783226e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9238868574876904, + "compression/movement_sparsity/model_sparsity": 0.8921485017245723, + "compression_loss": 105.57303619384766, + "distillation_loss": 3.549647331237793, + "epoch": 4.87, + "learning_rate": 2.8496290034751576e-05, + "loss": 109.2187, + "step": 5762, + "task_loss": 1.4663758277893066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999918502686797, + "compression/movement_sparsity/importance_threshold": -5.707855701827733e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9239481238610036, + "compression/movement_sparsity/model_sparsity": 0.8922076634094847, + "compression_loss": 105.57261657714844, + "distillation_loss": 3.6878225803375244, + "epoch": 4.87, + "learning_rate": 2.849159387620926e-05, + "loss": 109.7149, + "step": 5763, + "task_loss": 2.080394983291626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999200800099528, + "compression/movement_sparsity/importance_threshold": -5.597384170746844e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9239817261654014, + "compression/movement_sparsity/model_sparsity": 0.8922401113713537, + "compression_loss": 105.57218170166016, + "distillation_loss": 3.0918312072753906, + "epoch": 4.87, + "learning_rate": 2.848689771766695e-05, + "loss": 108.8809, + "step": 5764, + "task_loss": 1.2333855628967285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999216368486822, + "compression/movement_sparsity/importance_threshold": -5.488347314570874e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924062488552799, + "compression/movement_sparsity/model_sparsity": 0.8923180993222901, + "compression_loss": 105.57167053222656, + "distillation_loss": 4.3989715576171875, + "epoch": 4.87, + "learning_rate": 2.848220155912464e-05, + "loss": 109.465, + "step": 5765, + "task_loss": 3.06538987159729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999231733368701, + "compression/movement_sparsity/importance_threshold": -5.380735756356156e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924017892165841, + "compression/movement_sparsity/model_sparsity": 0.8922750349584185, + "compression_loss": 105.57122802734375, + "distillation_loss": 5.130695819854736, + "epoch": 4.87, + "learning_rate": 2.8477505400582328e-05, + "loss": 110.0723, + "step": 5766, + "task_loss": 2.618840456008911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999246896084018, + "compression/movement_sparsity/importance_threshold": -5.274540119133003e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240098791251896, + "compression/movement_sparsity/model_sparsity": 0.8922672971903646, + "compression_loss": 105.57072448730469, + "distillation_loss": 4.058493137359619, + "epoch": 4.87, + "learning_rate": 2.847280924204001e-05, + "loss": 109.6253, + "step": 5767, + "task_loss": 2.040745258331299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999261857971624, + "compression/movement_sparsity/importance_threshold": -5.169751025940403e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9239878671117339, + "compression/movement_sparsity/model_sparsity": 0.8922460413572879, + "compression_loss": 105.57025146484375, + "distillation_loss": 3.8542556762695312, + "epoch": 4.88, + "learning_rate": 2.84681130834977e-05, + "loss": 109.5365, + "step": 5768, + "task_loss": 2.327390670776367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999276620370371, + "compression/movement_sparsity/importance_threshold": -5.066359099817341e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9239674767850765, + "compression/movement_sparsity/model_sparsity": 0.8922263515010792, + "compression_loss": 105.56973266601562, + "distillation_loss": 4.185609817504883, + "epoch": 4.88, + "learning_rate": 2.8463416924955387e-05, + "loss": 109.5494, + "step": 5769, + "task_loss": 2.3065922260284424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999291184619109, + "compression/movement_sparsity/importance_threshold": -4.964354963811479e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240229837854215, + "compression/movement_sparsity/model_sparsity": 0.8922799516652028, + "compression_loss": 105.56925201416016, + "distillation_loss": 3.783383846282959, + "epoch": 4.88, + "learning_rate": 2.8458720766413077e-05, + "loss": 109.0953, + "step": 5770, + "task_loss": 1.3077412843704224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999305552056691, + "compression/movement_sparsity/importance_threshold": -4.863729240944456e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240214455677964, + "compression/movement_sparsity/model_sparsity": 0.8922784662900853, + "compression_loss": 105.56871032714844, + "distillation_loss": 4.315218925476074, + "epoch": 4.88, + "learning_rate": 2.845402460787076e-05, + "loss": 109.6265, + "step": 5771, + "task_loss": 2.119558095932007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999319724021967, + "compression/movement_sparsity/importance_threshold": -4.7644725542812794e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240678544282352, + "compression/movement_sparsity/model_sparsity": 0.8923232808633976, + "compression_loss": 105.56824493408203, + "distillation_loss": 3.453381299972534, + "epoch": 4.88, + "learning_rate": 2.844932844932845e-05, + "loss": 109.5374, + "step": 5772, + "task_loss": 2.280989170074463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999933370185379, + "compression/movement_sparsity/importance_threshold": -4.666575526843589e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240410488993898, + "compression/movement_sparsity/model_sparsity": 0.8922973961869315, + "compression_loss": 105.56769561767578, + "distillation_loss": 3.2475063800811768, + "epoch": 4.88, + "learning_rate": 2.844463229078614e-05, + "loss": 108.978, + "step": 5773, + "task_loss": 1.432131290435791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999347486891009, + "compression/movement_sparsity/importance_threshold": -4.5700287816790447e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240283735091929, + "compression/movement_sparsity/model_sparsity": 0.892285156235382, + "compression_loss": 105.56719970703125, + "distillation_loss": 4.007950782775879, + "epoch": 4.88, + "learning_rate": 2.843993613224383e-05, + "loss": 108.9299, + "step": 5774, + "task_loss": 1.9063670635223389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999361080472479, + "compression/movement_sparsity/importance_threshold": -4.47482294181796e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240646587513088, + "compression/movement_sparsity/model_sparsity": 0.8923201949678047, + "compression_loss": 105.5666732788086, + "distillation_loss": 4.3854804039001465, + "epoch": 4.88, + "learning_rate": 2.8435239973701515e-05, + "loss": 109.769, + "step": 5775, + "task_loss": 2.359445333480835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999374483937048, + "compression/movement_sparsity/importance_threshold": -4.380948630307996e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241024464385469, + "compression/movement_sparsity/model_sparsity": 0.8923566845317377, + "compression_loss": 105.56620025634766, + "distillation_loss": 4.397212982177734, + "epoch": 4.88, + "learning_rate": 2.84305438151592e-05, + "loss": 109.9174, + "step": 5776, + "task_loss": 3.209820032119751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999387698623569, + "compression/movement_sparsity/importance_threshold": -4.288396470196812e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241573810788453, + "compression/movement_sparsity/model_sparsity": 0.8924097319981431, + "compression_loss": 105.56568908691406, + "distillation_loss": 2.1097731590270996, + "epoch": 4.88, + "learning_rate": 2.8425847656616888e-05, + "loss": 108.9489, + "step": 5777, + "task_loss": 2.133603096008301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999400725870893, + "compression/movement_sparsity/importance_threshold": -4.197157084506048e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241647621386119, + "compression/movement_sparsity/model_sparsity": 0.8924168594957999, + "compression_loss": 105.5652084350586, + "distillation_loss": 4.154838562011719, + "epoch": 4.88, + "learning_rate": 2.8421151498074578e-05, + "loss": 109.4296, + "step": 5778, + "task_loss": 3.022413730621338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999413567017872, + "compression/movement_sparsity/importance_threshold": -4.1072210962833644e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241285007448312, + "compression/movement_sparsity/model_sparsity": 0.8923818437924487, + "compression_loss": 105.5647201538086, + "distillation_loss": 3.4539031982421875, + "epoch": 4.88, + "learning_rate": 2.8416455339532268e-05, + "loss": 108.6337, + "step": 5779, + "task_loss": 2.017198324203491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999426223403356, + "compression/movement_sparsity/importance_threshold": -4.018579128585095e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924114000956986, + "compression/movement_sparsity/model_sparsity": 0.8923678421169225, + "compression_loss": 105.56423950195312, + "distillation_loss": 4.885636329650879, + "epoch": 4.89, + "learning_rate": 2.841175918098995e-05, + "loss": 109.6398, + "step": 5780, + "task_loss": 2.961885690689087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999438696366199, + "compression/movement_sparsity/importance_threshold": -3.931221804415533e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924143143622688, + "compression/movement_sparsity/model_sparsity": 0.8923959836424044, + "compression_loss": 105.5636978149414, + "distillation_loss": 2.696073532104492, + "epoch": 4.89, + "learning_rate": 2.840706302244764e-05, + "loss": 109.4633, + "step": 5781, + "task_loss": 2.0201523303985596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999450987245249, + "compression/movement_sparsity/importance_threshold": -3.8451397468483584e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241687448106023, + "compression/movement_sparsity/model_sparsity": 0.8924207053507552, + "compression_loss": 105.56317138671875, + "distillation_loss": 5.161613464355469, + "epoch": 4.89, + "learning_rate": 2.8402366863905327e-05, + "loss": 110.2486, + "step": 5782, + "task_loss": 2.3222835063934326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999946309737936, + "compression/movement_sparsity/importance_threshold": -3.7603235789052114e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241447056886484, + "compression/movement_sparsity/model_sparsity": 0.8923974920465935, + "compression_loss": 105.56259155273438, + "distillation_loss": 4.012014389038086, + "epoch": 4.89, + "learning_rate": 2.8397670705363016e-05, + "loss": 109.468, + "step": 5783, + "task_loss": 2.495551586151123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999475028107382, + "compression/movement_sparsity/importance_threshold": -3.6767639236337524e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241288823181956, + "compression/movement_sparsity/model_sparsity": 0.8923822122575942, + "compression_loss": 105.56210327148438, + "distillation_loss": 3.2486085891723633, + "epoch": 4.89, + "learning_rate": 2.83929745468207e-05, + "loss": 109.3132, + "step": 5784, + "task_loss": 2.2157177925109863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999486780768168, + "compression/movement_sparsity/importance_threshold": -3.594451404072968e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240917385360098, + "compression/movement_sparsity/model_sparsity": 0.8923463444785942, + "compression_loss": 105.56159973144531, + "distillation_loss": 4.397359848022461, + "epoch": 4.89, + "learning_rate": 2.838827838827839e-05, + "loss": 109.1712, + "step": 5785, + "task_loss": 2.541592597961426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999498356700568, + "compression/movement_sparsity/importance_threshold": -3.513376643244498e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241066556697223, + "compression/movement_sparsity/model_sparsity": 0.8923607491628731, + "compression_loss": 105.56108093261719, + "distillation_loss": 5.419126510620117, + "epoch": 4.89, + "learning_rate": 2.838358222973608e-05, + "loss": 110.1396, + "step": 5786, + "task_loss": 3.4935543537139893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999509757243433, + "compression/movement_sparsity/importance_threshold": -3.4335302642220233e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240641460121004, + "compression/movement_sparsity/model_sparsity": 0.8923196998427656, + "compression_loss": 105.56047821044922, + "distillation_loss": 4.754867076873779, + "epoch": 4.89, + "learning_rate": 2.8378886071193765e-05, + "loss": 109.0265, + "step": 5787, + "task_loss": 2.8066933155059814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999520983735616, + "compression/movement_sparsity/importance_threshold": -3.35490289001851e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9240720159627401, + "compression/movement_sparsity/model_sparsity": 0.89232729943639, + "compression_loss": 105.55996704101562, + "distillation_loss": 4.321389675140381, + "epoch": 4.89, + "learning_rate": 2.837418991265145e-05, + "loss": 109.6211, + "step": 5788, + "task_loss": 1.9351894855499268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999532037515967, + "compression/movement_sparsity/importance_threshold": -3.277485143681619e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241102209958454, + "compression/movement_sparsity/model_sparsity": 0.8923641920090757, + "compression_loss": 105.5594253540039, + "distillation_loss": 4.105820178985596, + "epoch": 4.89, + "learning_rate": 2.8369493754109138e-05, + "loss": 109.2398, + "step": 5789, + "task_loss": 1.4320447444915771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999542919923339, + "compression/movement_sparsity/importance_threshold": -3.201267648250336e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241332704118855, + "compression/movement_sparsity/model_sparsity": 0.8923864496067665, + "compression_loss": 105.55889892578125, + "distillation_loss": 3.9293527603149414, + "epoch": 4.89, + "learning_rate": 2.8364797595566827e-05, + "loss": 109.7271, + "step": 5790, + "task_loss": 3.0048444271087646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999553632296582, + "compression/movement_sparsity/importance_threshold": -3.1262410267723223e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924100836675916, + "compression/movement_sparsity/model_sparsity": 0.8923551300694054, + "compression_loss": 105.55841827392578, + "distillation_loss": 5.304651260375977, + "epoch": 4.89, + "learning_rate": 2.8360101437024517e-05, + "loss": 109.4457, + "step": 5791, + "task_loss": 3.111219644546509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999564175974547, + "compression/movement_sparsity/importance_threshold": -3.0523959022778907e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924108074645671, + "compression/movement_sparsity/model_sparsity": 0.8923621193926327, + "compression_loss": 105.55789184570312, + "distillation_loss": 5.228198051452637, + "epoch": 4.9, + "learning_rate": 2.8355405278482207e-05, + "loss": 109.8292, + "step": 5792, + "task_loss": 2.5225722789764404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999574552296088, + "compression/movement_sparsity/importance_threshold": -2.979722897806028e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241846278018933, + "compression/movement_sparsity/model_sparsity": 0.8924360427124336, + "compression_loss": 105.557373046875, + "distillation_loss": 4.680537700653076, + "epoch": 4.9, + "learning_rate": 2.835070911993989e-05, + "loss": 109.5083, + "step": 5793, + "task_loss": 3.023618698120117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999584762600052, + "compression/movement_sparsity/importance_threshold": -2.9082126364130684e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241648813802882, + "compression/movement_sparsity/model_sparsity": 0.8924169746411579, + "compression_loss": 105.55689239501953, + "distillation_loss": 3.011064291000366, + "epoch": 4.9, + "learning_rate": 2.8346012961397576e-05, + "loss": 109.1844, + "step": 5794, + "task_loss": 2.1663336753845215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999594808225295, + "compression/movement_sparsity/importance_threshold": -2.8378557411119776e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241316725734223, + "compression/movement_sparsity/model_sparsity": 0.8923849066589701, + "compression_loss": 105.55635833740234, + "distillation_loss": 5.132295608520508, + "epoch": 4.9, + "learning_rate": 2.8341316802855266e-05, + "loss": 110.3995, + "step": 5795, + "task_loss": 2.7465851306915283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999604690510666, + "compression/movement_sparsity/importance_threshold": -2.7686428349590897e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241634027835014, + "compression/movement_sparsity/model_sparsity": 0.8924155468387193, + "compression_loss": 105.55585479736328, + "distillation_loss": 4.136115550994873, + "epoch": 4.9, + "learning_rate": 2.8336620644312956e-05, + "loss": 109.5348, + "step": 5796, + "task_loss": 2.054055690765381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999614410795017, + "compression/movement_sparsity/importance_threshold": -2.700564541002065e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924181134020776, + "compression/movement_sparsity/model_sparsity": 0.8924326689534458, + "compression_loss": 105.55535888671875, + "distillation_loss": 4.318592071533203, + "epoch": 4.9, + "learning_rate": 2.833192448577064e-05, + "loss": 108.8359, + "step": 5797, + "task_loss": 1.6425416469573975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.99996239704172, + "compression/movement_sparsity/importance_threshold": -2.63361148225387e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241325907343303, + "compression/movement_sparsity/model_sparsity": 0.8923857932782262, + "compression_loss": 105.55484771728516, + "distillation_loss": 4.442142963409424, + "epoch": 4.9, + "learning_rate": 2.8327228327228328e-05, + "loss": 109.8726, + "step": 5798, + "task_loss": 2.545729160308838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999633370716063, + "compression/movement_sparsity/importance_threshold": -2.567774281788185e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924181885243337, + "compression/movement_sparsity/model_sparsity": 0.8924333943692009, + "compression_loss": 105.55436706542969, + "distillation_loss": 3.740147113800049, + "epoch": 4.9, + "learning_rate": 2.8322532168686018e-05, + "loss": 110.0629, + "step": 5799, + "task_loss": 1.9118313789367676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999642613030463, + "compression/movement_sparsity/importance_threshold": -2.5030435626179764e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241856771286452, + "compression/movement_sparsity/model_sparsity": 0.8924370559915835, + "compression_loss": 105.55387878417969, + "distillation_loss": 3.5427160263061523, + "epoch": 4.9, + "learning_rate": 2.8317836010143704e-05, + "loss": 109.1896, + "step": 5800, + "task_loss": 1.6276402473449707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999651698699247, + "compression/movement_sparsity/importance_threshold": -2.439409947790905e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241784868555607, + "compression/movement_sparsity/model_sparsity": 0.8924301127264994, + "compression_loss": 105.55339813232422, + "distillation_loss": 4.611199378967285, + "epoch": 4.9, + "learning_rate": 2.831313985160139e-05, + "loss": 109.7936, + "step": 5801, + "task_loss": 3.32287335395813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999660629061268, + "compression/movement_sparsity/importance_threshold": -2.3768640603546304e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924137610808905, + "compression/movement_sparsity/model_sparsity": 0.8923906408977957, + "compression_loss": 105.55297088623047, + "distillation_loss": 3.189418077468872, + "epoch": 4.9, + "learning_rate": 2.8308443693059077e-05, + "loss": 110.0742, + "step": 5802, + "task_loss": 3.178426742553711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999669405455378, + "compression/movement_sparsity/importance_threshold": -2.3153965233394663e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924112951630234, + "compression/movement_sparsity/model_sparsity": 0.8923668288377726, + "compression_loss": 105.55245971679688, + "distillation_loss": 4.0032148361206055, + "epoch": 4.9, + "learning_rate": 2.8303747534516767e-05, + "loss": 109.4448, + "step": 5803, + "task_loss": 2.4817261695861816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999678029220427, + "compression/movement_sparsity/importance_threshold": -2.2549979597843994e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241331034735386, + "compression/movement_sparsity/model_sparsity": 0.8923862884032654, + "compression_loss": 105.55204010009766, + "distillation_loss": 3.3149986267089844, + "epoch": 4.91, + "learning_rate": 2.8299051375974456e-05, + "loss": 109.5433, + "step": 5804, + "task_loss": 2.5761148929595947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999686501695267, + "compression/movement_sparsity/importance_threshold": -2.19565899273709e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241516336300448, + "compression/movement_sparsity/model_sparsity": 0.8924041819918901, + "compression_loss": 105.5515365600586, + "distillation_loss": 5.790671348571777, + "epoch": 4.91, + "learning_rate": 2.829435521743214e-05, + "loss": 110.3215, + "step": 5805, + "task_loss": 2.727064371109009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999969482421875, + "compression/movement_sparsity/importance_threshold": -2.137370245236525e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241480683039217, + "compression/movement_sparsity/model_sparsity": 0.8924007391456875, + "compression_loss": 105.55103302001953, + "distillation_loss": 2.8517465591430664, + "epoch": 4.91, + "learning_rate": 2.828965905888983e-05, + "loss": 109.1225, + "step": 5806, + "task_loss": 2.2918078899383545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999702998129727, + "compression/movement_sparsity/importance_threshold": -2.0801223403130176e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9241553539703472, + "compression/movement_sparsity/model_sparsity": 0.892407774527058, + "compression_loss": 105.55044555664062, + "distillation_loss": 4.233865737915039, + "epoch": 4.91, + "learning_rate": 2.8284962900347516e-05, + "loss": 109.2089, + "step": 5807, + "task_loss": 2.0620834827423096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999711024767048, + "compression/movement_sparsity/importance_threshold": -2.0239059010229016e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9242205433948124, + "compression/movement_sparsity/model_sparsity": 0.8924707244942468, + "compression_loss": 105.5499038696289, + "distillation_loss": 4.127224922180176, + "epoch": 4.91, + "learning_rate": 2.8280266741805205e-05, + "loss": 109.7811, + "step": 5808, + "task_loss": 1.9554705619812012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999718905469568, + "compression/movement_sparsity/importance_threshold": -1.9687115503878166e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9242476470278488, + "compression/movement_sparsity/model_sparsity": 0.8924968970341077, + "compression_loss": 105.54935455322266, + "distillation_loss": 5.232905387878418, + "epoch": 4.91, + "learning_rate": 2.8275570583262895e-05, + "loss": 109.798, + "step": 5809, + "task_loss": 3.2712509632110596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999726641576134, + "compression/movement_sparsity/importance_threshold": -1.9145299114554232e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9243031898006966, + "compression/movement_sparsity/model_sparsity": 0.8925505317418387, + "compression_loss": 105.54874420166016, + "distillation_loss": 4.367021560668945, + "epoch": 4.91, + "learning_rate": 2.8270874424720578e-05, + "loss": 109.8634, + "step": 5810, + "task_loss": 2.90411376953125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999734234425601, + "compression/movement_sparsity/importance_threshold": -1.861351607264708e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9242878076244463, + "compression/movement_sparsity/model_sparsity": 0.8925356779906638, + "compression_loss": 105.54820251464844, + "distillation_loss": 4.384200096130371, + "epoch": 4.91, + "learning_rate": 2.8266178266178268e-05, + "loss": 110.0096, + "step": 5811, + "task_loss": 2.4015374183654785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999741685356818, + "compression/movement_sparsity/importance_threshold": -1.8091672608633314e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9242968222951791, + "compression/movement_sparsity/model_sparsity": 0.8925443829797244, + "compression_loss": 105.54763793945312, + "distillation_loss": 2.603367567062378, + "epoch": 4.91, + "learning_rate": 2.8261482107635957e-05, + "loss": 109.0088, + "step": 5812, + "task_loss": 1.191414475440979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999748995708638, + "compression/movement_sparsity/importance_threshold": -1.757967495272933e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924330615386259, + "compression/movement_sparsity/model_sparsity": 0.8925770151741662, + "compression_loss": 105.54710388183594, + "distillation_loss": 2.8490684032440186, + "epoch": 4.91, + "learning_rate": 2.8256785949093644e-05, + "loss": 109.0368, + "step": 5813, + "task_loss": 2.2161474227905273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999756166819911, + "compression/movement_sparsity/importance_threshold": -1.7077429335585204e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9243826286054867, + "compression/movement_sparsity/model_sparsity": 0.892627241579302, + "compression_loss": 105.54655456542969, + "distillation_loss": 3.742361545562744, + "epoch": 4.91, + "learning_rate": 2.8252089790551327e-05, + "loss": 109.2722, + "step": 5814, + "task_loss": 2.342648983001709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999976320002949, + "compression/movement_sparsity/importance_threshold": -1.6584841987417331e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244133810338196, + "compression/movement_sparsity/model_sparsity": 0.8926569375671161, + "compression_loss": 105.54597473144531, + "distillation_loss": 4.044370651245117, + "epoch": 4.91, + "learning_rate": 2.8247393632009016e-05, + "loss": 108.9997, + "step": 5815, + "task_loss": 2.2245523929595947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999770096676225, + "compression/movement_sparsity/importance_threshold": -1.610181913861558e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9243723976696552, + "compression/movement_sparsity/model_sparsity": 0.8926173621075902, + "compression_loss": 105.54549407958984, + "distillation_loss": 4.161084175109863, + "epoch": 4.92, + "learning_rate": 2.8242697473466706e-05, + "loss": 109.3121, + "step": 5816, + "task_loss": 1.9251513481140137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999776858098969, + "compression/movement_sparsity/importance_threshold": -1.5628267019656555e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924418377260059, + "compression/movement_sparsity/model_sparsity": 0.892661762157614, + "compression_loss": 105.54485321044922, + "distillation_loss": 3.7956342697143555, + "epoch": 4.92, + "learning_rate": 2.8238001314924396e-05, + "loss": 109.4383, + "step": 5817, + "task_loss": 2.2154626846313477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999783485636571, + "compression/movement_sparsity/importance_threshold": -1.5164091860930123e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9243449720840927, + "compression/movement_sparsity/model_sparsity": 0.8925908786752628, + "compression_loss": 105.54429626464844, + "distillation_loss": 4.210668563842773, + "epoch": 4.92, + "learning_rate": 2.823330515638208e-05, + "loss": 109.3034, + "step": 5818, + "task_loss": 1.7352038621902466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999789980627884, + "compression/movement_sparsity/importance_threshold": -1.4709199892739416e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9243694047035785, + "compression/movement_sparsity/model_sparsity": 0.8926144719591058, + "compression_loss": 105.54365539550781, + "distillation_loss": 4.7040205001831055, + "epoch": 4.92, + "learning_rate": 2.822860899783977e-05, + "loss": 109.8601, + "step": 5819, + "task_loss": 2.434617757797241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999979634441176, + "compression/movement_sparsity/importance_threshold": -1.4263497345647774e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9243058846625823, + "compression/movement_sparsity/model_sparsity": 0.8925531340269283, + "compression_loss": 105.54315185546875, + "distillation_loss": 4.129504203796387, + "epoch": 4.92, + "learning_rate": 2.8223912839297455e-05, + "loss": 109.5991, + "step": 5820, + "task_loss": 3.0206573009490967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999802578327048, + "compression/movement_sparsity/importance_threshold": -1.382689044995833e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9243949701189899, + "compression/movement_sparsity/model_sparsity": 0.8926391591238493, + "compression_loss": 105.54249572753906, + "distillation_loss": 4.849095344543457, + "epoch": 4.92, + "learning_rate": 2.8219216680755145e-05, + "loss": 109.2778, + "step": 5821, + "task_loss": 3.299720525741577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999808683712603, + "compression/movement_sparsity/importance_threshold": -1.3399285435974212e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924425352898126, + "compression/movement_sparsity/model_sparsity": 0.8926684981610538, + "compression_loss": 105.54194641113281, + "distillation_loss": 3.5940475463867188, + "epoch": 4.92, + "learning_rate": 2.8214520522212834e-05, + "loss": 109.2517, + "step": 5822, + "task_loss": 2.327202796936035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999814661907274, + "compression/movement_sparsity/importance_threshold": -1.2980588534258763e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244232423204545, + "compression/movement_sparsity/model_sparsity": 0.8926664600882181, + "compression_loss": 105.54135131835938, + "distillation_loss": 4.946146488189697, + "epoch": 4.92, + "learning_rate": 2.8209824363670517e-05, + "loss": 109.588, + "step": 5823, + "task_loss": 3.677462339401245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999820514249912, + "compression/movement_sparsity/importance_threshold": -1.2570705975115115e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9243963175499327, + "compression/movement_sparsity/model_sparsity": 0.8926404602663941, + "compression_loss": 105.54085540771484, + "distillation_loss": 3.913628101348877, + "epoch": 4.92, + "learning_rate": 2.8205128205128207e-05, + "loss": 109.2946, + "step": 5824, + "task_loss": 2.3776330947875977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999826242079369, + "compression/movement_sparsity/importance_threshold": -1.216954398901987e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244144303605716, + "compression/movement_sparsity/model_sparsity": 0.892657950846266, + "compression_loss": 105.54023742675781, + "distillation_loss": 4.738117694854736, + "epoch": 4.92, + "learning_rate": 2.8200432046585897e-05, + "loss": 109.4187, + "step": 5825, + "task_loss": 2.7193286418914795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999831846734497, + "compression/movement_sparsity/importance_threshold": -1.1777008806276162e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244156227773351, + "compression/movement_sparsity/model_sparsity": 0.8926591022998455, + "compression_loss": 105.5396957397461, + "distillation_loss": 4.396115303039551, + "epoch": 4.92, + "learning_rate": 2.8195735888043583e-05, + "loss": 109.6467, + "step": 5826, + "task_loss": 2.877631425857544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999837329554148, + "compression/movement_sparsity/importance_threshold": -1.1393006657360594e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9243864204907949, + "compression/movement_sparsity/model_sparsity": 0.8926309032016846, + "compression_loss": 105.53909301757812, + "distillation_loss": 3.7712345123291016, + "epoch": 4.93, + "learning_rate": 2.8191039729501266e-05, + "loss": 109.2341, + "step": 5827, + "task_loss": 2.227628469467163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999842691877171, + "compression/movement_sparsity/importance_threshold": -1.1017443772576296e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244111512144717, + "compression/movement_sparsity/model_sparsity": 0.8926547843489225, + "compression_loss": 105.53858184814453, + "distillation_loss": 3.462930202484131, + "epoch": 4.93, + "learning_rate": 2.8186343570958956e-05, + "loss": 109.922, + "step": 5828, + "task_loss": 1.478043556213379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999984793504242, + "compression/movement_sparsity/importance_threshold": -1.065022638248661e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244567492315112, + "compression/movement_sparsity/model_sparsity": 0.8926988159338008, + "compression_loss": 105.5379867553711, + "distillation_loss": 3.968722105026245, + "epoch": 4.93, + "learning_rate": 2.8181647412416645e-05, + "loss": 109.1859, + "step": 5829, + "task_loss": 2.567662239074707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999853060388745, + "compression/movement_sparsity/importance_threshold": -1.0291260717221196e-07, + "compression/movement_sparsity/linear_layer_sparsity": 0.924458263600801, + "compression/movement_sparsity/model_sparsity": 0.8927002782798468, + "compression_loss": 105.53746032714844, + "distillation_loss": 5.396866798400879, + "epoch": 4.93, + "learning_rate": 2.8176951253874335e-05, + "loss": 110.2105, + "step": 5830, + "task_loss": 3.264470338821411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999858069254998, + "compression/movement_sparsity/importance_threshold": -9.940453007430128e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244797509508809, + "compression/movement_sparsity/model_sparsity": 0.8927210274733485, + "compression_loss": 105.53691101074219, + "distillation_loss": 5.850165843963623, + "epoch": 4.93, + "learning_rate": 2.8172255095332018e-05, + "loss": 109.6238, + "step": 5831, + "task_loss": 2.836446523666382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999862962980028, + "compression/movement_sparsity/importance_threshold": -9.597709483503275e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244462082673212, + "compression/movement_sparsity/model_sparsity": 0.8926886370841585, + "compression_loss": 105.53641510009766, + "distillation_loss": 2.696895122528076, + "epoch": 4.93, + "learning_rate": 2.8167558936789708e-05, + "loss": 108.5432, + "step": 5832, + "task_loss": 2.3448662757873535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999986774290269, + "compression/movement_sparsity/importance_threshold": -9.262936375657033e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244630094195201, + "compression/movement_sparsity/model_sparsity": 0.892704861065093, + "compression_loss": 105.53585815429688, + "distillation_loss": 2.3151326179504395, + "epoch": 4.93, + "learning_rate": 2.8162862778247394e-05, + "loss": 109.1485, + "step": 5833, + "task_loss": 0.9078324437141418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999872410361835, + "compression/movement_sparsity/importance_threshold": -8.936039914368005e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244384217858549, + "compression/movement_sparsity/model_sparsity": 0.8926811180922846, + "compression_loss": 105.53533172607422, + "distillation_loss": 4.262421607971191, + "epoch": 4.93, + "learning_rate": 2.8158166619705084e-05, + "loss": 109.3855, + "step": 5834, + "task_loss": 2.0490643978118896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999876966696312, + "compression/movement_sparsity/importance_threshold": -8.616926330112795e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244081463242275, + "compression/movement_sparsity/model_sparsity": 0.8926518826859022, + "compression_loss": 105.53484344482422, + "distillation_loss": 3.390775203704834, + "epoch": 4.93, + "learning_rate": 2.8153470461162767e-05, + "loss": 109.3964, + "step": 5835, + "task_loss": 1.9313862323760986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999881413244974, + "compression/movement_sparsity/importance_threshold": -8.305501853194536e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244277496558209, + "compression/movement_sparsity/model_sparsity": 0.8926708125827485, + "compression_loss": 105.53431701660156, + "distillation_loss": 4.898916244506836, + "epoch": 4.93, + "learning_rate": 2.8148774302620457e-05, + "loss": 109.5835, + "step": 5836, + "task_loss": 3.042492628097534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999885751346672, + "compression/movement_sparsity/importance_threshold": -8.001672714003094e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244455405139336, + "compression/movement_sparsity/model_sparsity": 0.8926879922701539, + "compression_loss": 105.53374481201172, + "distillation_loss": 3.7849502563476562, + "epoch": 4.93, + "learning_rate": 2.8144078144078146e-05, + "loss": 110.0813, + "step": 5837, + "task_loss": 1.8033572435379028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999889982340258, + "compression/movement_sparsity/importance_threshold": -7.70534514310181e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9244629617228495, + "compression/movement_sparsity/model_sparsity": 0.8927048150069498, + "compression_loss": 105.53325653076172, + "distillation_loss": 2.847432851791382, + "epoch": 4.93, + "learning_rate": 2.8139381985535833e-05, + "loss": 108.9619, + "step": 5838, + "task_loss": 1.4793275594711304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999894107564582, + "compression/movement_sparsity/importance_threshold": -7.41642537070708e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9245609068358105, + "compression/movement_sparsity/model_sparsity": 0.8927993954039661, + "compression_loss": 105.53274536132812, + "distillation_loss": 3.3297038078308105, + "epoch": 4.94, + "learning_rate": 2.8134685826993522e-05, + "loss": 109.4299, + "step": 5839, + "task_loss": 1.1347813606262207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999898128358496, + "compression/movement_sparsity/importance_threshold": -7.134819627295508e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9245997915464711, + "compression/movement_sparsity/model_sparsity": 0.8928369443051921, + "compression_loss": 105.53221893310547, + "distillation_loss": 4.980269908905029, + "epoch": 4.94, + "learning_rate": 2.8129989668451205e-05, + "loss": 109.5584, + "step": 5840, + "task_loss": 2.2784786224365234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999902046060852, + "compression/movement_sparsity/importance_threshold": -6.86043414325696e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9245908245724089, + "compression/movement_sparsity/model_sparsity": 0.8928282853742746, + "compression_loss": 105.5317153930664, + "distillation_loss": 4.127524375915527, + "epoch": 4.94, + "learning_rate": 2.8125293509908895e-05, + "loss": 108.7755, + "step": 5841, + "task_loss": 2.0062153339385986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999905862010502, + "compression/movement_sparsity/importance_threshold": -6.593175148981306e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9246091520180653, + "compression/movement_sparsity/model_sparsity": 0.8928459832157909, + "compression_loss": 105.53118896484375, + "distillation_loss": 3.4782378673553467, + "epoch": 4.94, + "learning_rate": 2.8120597351366585e-05, + "loss": 110.2042, + "step": 5842, + "task_loss": 1.7551798820495605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999909577546297, + "compression/movement_sparsity/importance_threshold": -6.332948874771677e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9246162349736409, + "compression/movement_sparsity/model_sparsity": 0.8928528228500529, + "compression_loss": 105.53069305419922, + "distillation_loss": 3.982268810272217, + "epoch": 4.94, + "learning_rate": 2.8115901192824275e-05, + "loss": 108.8798, + "step": 5843, + "task_loss": 2.2816355228424072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999913194007086, + "compression/movement_sparsity/importance_threshold": -6.079661551191412e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.924609748226447, + "compression/movement_sparsity/model_sparsity": 0.8928465589425806, + "compression_loss": 105.53014373779297, + "distillation_loss": 5.787306785583496, + "epoch": 4.94, + "learning_rate": 2.8111205034281958e-05, + "loss": 109.4694, + "step": 5844, + "task_loss": 4.669091701507568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999916712731723, + "compression/movement_sparsity/importance_threshold": -5.833219408543644e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.924622519009985, + "compression/movement_sparsity/model_sparsity": 0.8928588910104165, + "compression_loss": 105.5296401977539, + "distillation_loss": 5.608484745025635, + "epoch": 4.94, + "learning_rate": 2.8106508875739644e-05, + "loss": 109.8145, + "step": 5845, + "task_loss": 2.7302603721618652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999920135059059, + "compression/movement_sparsity/importance_threshold": -5.5935286773049764e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9246560736177123, + "compression/movement_sparsity/model_sparsity": 0.8928912929141424, + "compression_loss": 105.52909851074219, + "distillation_loss": 4.4703288078308105, + "epoch": 4.94, + "learning_rate": 2.8101812717197334e-05, + "loss": 109.4073, + "step": 5846, + "task_loss": 2.8261890411376953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999923462327946, + "compression/movement_sparsity/importance_threshold": -5.360495587691805e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247058212450892, + "compression/movement_sparsity/model_sparsity": 0.8929393315574772, + "compression_loss": 105.52857971191406, + "distillation_loss": 4.259686470031738, + "epoch": 4.94, + "learning_rate": 2.8097116558655023e-05, + "loss": 109.172, + "step": 5847, + "task_loss": 1.9527904987335205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999926695877234, + "compression/movement_sparsity/importance_threshold": -5.1340263703542055e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.924717387687696, + "compression/movement_sparsity/model_sparsity": 0.8929505006571979, + "compression_loss": 105.5279769897461, + "distillation_loss": 3.723161220550537, + "epoch": 4.94, + "learning_rate": 2.8092420400112706e-05, + "loss": 109.0592, + "step": 5848, + "task_loss": 1.7695951461791992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999929837045775, + "compression/movement_sparsity/importance_threshold": -4.914027255508574e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9246842385016683, + "compression/movement_sparsity/model_sparsity": 0.8929184902476891, + "compression_loss": 105.52749633789062, + "distillation_loss": 3.7978339195251465, + "epoch": 4.94, + "learning_rate": 2.8087724241570396e-05, + "loss": 109.3704, + "step": 5849, + "task_loss": 1.4150173664093018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999993288717242, + "compression/movement_sparsity/importance_threshold": -4.700404473631514e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.924745349860802, + "compression/movement_sparsity/model_sparsity": 0.892977502243636, + "compression_loss": 105.5269546508789, + "distillation_loss": 4.235128402709961, + "epoch": 4.94, + "learning_rate": 2.8083028083028086e-05, + "loss": 109.7843, + "step": 5850, + "task_loss": 2.5113155841827393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999935847596021, + "compression/movement_sparsity/importance_threshold": -4.493064255112894e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247028163548449, + "compression/movement_sparsity/model_sparsity": 0.892936429894457, + "compression_loss": 105.52645874023438, + "distillation_loss": 3.549377679824829, + "epoch": 4.95, + "learning_rate": 2.8078331924485772e-05, + "loss": 109.5383, + "step": 5851, + "task_loss": 2.607452869415283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999938719655429, + "compression/movement_sparsity/importance_threshold": -4.291912830255845e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247228489564732, + "compression/movement_sparsity/model_sparsity": 0.8929557743145918, + "compression_loss": 105.52588653564453, + "distillation_loss": 4.412120819091797, + "epoch": 4.95, + "learning_rate": 2.8073635765943462e-05, + "loss": 109.8058, + "step": 5852, + "task_loss": 1.9534404277801514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999941504689496, + "compression/movement_sparsity/importance_threshold": -4.0968564296237076e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247458029791722, + "compression/movement_sparsity/model_sparsity": 0.8929779397959963, + "compression_loss": 105.52539825439453, + "distillation_loss": 2.6091818809509277, + "epoch": 4.95, + "learning_rate": 2.8068939607401145e-05, + "loss": 108.4475, + "step": 5853, + "task_loss": 0.8372893333435059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999944204037072, + "compression/movement_sparsity/importance_threshold": -3.907801283519613e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247926053371429, + "compression/movement_sparsity/model_sparsity": 0.8930231343489898, + "compression_loss": 105.52485656738281, + "distillation_loss": 5.425722122192383, + "epoch": 4.95, + "learning_rate": 2.8064243448858834e-05, + "loss": 109.4549, + "step": 5854, + "task_loss": 2.500831127166748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999946819037011, + "compression/movement_sparsity/importance_threshold": -3.724653622246693e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247769369808695, + "compression/movement_sparsity/model_sparsity": 0.8930080042489559, + "compression_loss": 105.52434539794922, + "distillation_loss": 3.830941915512085, + "epoch": 4.95, + "learning_rate": 2.8059547290316524e-05, + "loss": 108.8586, + "step": 5855, + "task_loss": 1.867904782295227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999949351028162, + "compression/movement_sparsity/importance_threshold": -3.547319676368288e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247463514908835, + "compression/movement_sparsity/model_sparsity": 0.8929784694646429, + "compression_loss": 105.52375030517578, + "distillation_loss": 3.827970027923584, + "epoch": 4.95, + "learning_rate": 2.8054851131774214e-05, + "loss": 108.9452, + "step": 5856, + "task_loss": 2.050349235534668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999951801349377, + "compression/movement_sparsity/importance_threshold": -3.3757056762742654e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.924760314691185, + "compression/movement_sparsity/model_sparsity": 0.8929919529860583, + "compression_loss": 105.52322387695312, + "distillation_loss": 2.530643939971924, + "epoch": 4.95, + "learning_rate": 2.8050154973231897e-05, + "loss": 109.7302, + "step": 5857, + "task_loss": 1.3130748271942139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999954171339508, + "compression/movement_sparsity/importance_threshold": -3.209717852267757e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247861305141167, + "compression/movement_sparsity/model_sparsity": 0.8930168819560534, + "compression_loss": 105.52264404296875, + "distillation_loss": 3.407151222229004, + "epoch": 4.95, + "learning_rate": 2.8045458814689583e-05, + "loss": 108.642, + "step": 5858, + "task_loss": 1.5685487985610962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999956462337406, + "compression/movement_sparsity/importance_threshold": -3.049262434738631e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247675645851077, + "compression/movement_sparsity/model_sparsity": 0.8929989538238213, + "compression_loss": 105.52210235595703, + "distillation_loss": 4.470716953277588, + "epoch": 4.95, + "learning_rate": 2.8040762656147273e-05, + "loss": 109.5695, + "step": 5859, + "task_loss": 2.057111978530884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999958675681923, + "compression/movement_sparsity/importance_threshold": -2.894245654163491e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.924785462760729, + "compression/movement_sparsity/model_sparsity": 0.8930162371420489, + "compression_loss": 105.52153778076172, + "distillation_loss": 5.06801176071167, + "epoch": 4.95, + "learning_rate": 2.8036066497604963e-05, + "loss": 109.2933, + "step": 5860, + "task_loss": 2.635833501815796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999960812711909, + "compression/movement_sparsity/importance_threshold": -2.7445737409322046e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247773900992396, + "compression/movement_sparsity/model_sparsity": 0.893008441801316, + "compression_loss": 105.52100372314453, + "distillation_loss": 4.725068092346191, + "epoch": 4.95, + "learning_rate": 2.8031370339062646e-05, + "loss": 109.5713, + "step": 5861, + "task_loss": 3.1156582832336426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999962874766216, + "compression/movement_sparsity/importance_threshold": -2.600152925347904e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247800849611253, + "compression/movement_sparsity/model_sparsity": 0.8930110440864056, + "compression_loss": 105.52043151855469, + "distillation_loss": 3.3206117153167725, + "epoch": 4.95, + "learning_rate": 2.8026674180520335e-05, + "loss": 109.3429, + "step": 5862, + "task_loss": 1.3587496280670166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999964863183696, + "compression/movement_sparsity/importance_threshold": -2.4608894379739288e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9248166444590968, + "compression/movement_sparsity/model_sparsity": 0.8930463476531516, + "compression_loss": 105.51992797851562, + "distillation_loss": 2.6819381713867188, + "epoch": 4.96, + "learning_rate": 2.8021978021978025e-05, + "loss": 109.293, + "step": 5863, + "task_loss": 2.0624547004699707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.99999667793032, + "compression/movement_sparsity/importance_threshold": -2.326689509113411e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9248115170670135, + "compression/movement_sparsity/model_sparsity": 0.89304139640276, + "compression_loss": 105.51939392089844, + "distillation_loss": 4.075498580932617, + "epoch": 4.96, + "learning_rate": 2.801728186343571e-05, + "loss": 109.0327, + "step": 5864, + "task_loss": 1.5036287307739258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999996862446358, + "compression/movement_sparsity/importance_threshold": -2.197459369069482e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247899939444307, + "compression/movement_sparsity/model_sparsity": 0.8930206126656508, + "compression_loss": 105.51885986328125, + "distillation_loss": 3.6778221130371094, + "epoch": 4.96, + "learning_rate": 2.8012585704893394e-05, + "loss": 109.2772, + "step": 5865, + "task_loss": 3.221446990966797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999970400003686, + "compression/movement_sparsity/importance_threshold": -2.0731052484054824e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247336522523513, + "compression/movement_sparsity/model_sparsity": 0.8929662064840217, + "compression_loss": 105.5183334350586, + "distillation_loss": 4.524487495422363, + "epoch": 4.96, + "learning_rate": 2.8007889546351084e-05, + "loss": 110.2112, + "step": 5866, + "task_loss": 1.9631383419036865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999972107262372, + "compression/movement_sparsity/importance_threshold": -1.9535333774245434e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9247798822502755, + "compression/movement_sparsity/model_sparsity": 0.8930108483392971, + "compression_loss": 105.51778411865234, + "distillation_loss": 3.589909076690674, + "epoch": 4.96, + "learning_rate": 2.8003193387808774e-05, + "loss": 109.4291, + "step": 5867, + "task_loss": 2.675635814666748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999973747578486, + "compression/movement_sparsity/importance_threshold": -1.838649986603269e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9248069739591441, + "compression/movement_sparsity/model_sparsity": 0.8930370093646223, + "compression_loss": 105.51721954345703, + "distillation_loss": 4.056374549865723, + "epoch": 4.96, + "learning_rate": 2.7998497229266464e-05, + "loss": 110.3129, + "step": 5868, + "task_loss": 2.75839900970459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999975322290882, + "compression/movement_sparsity/importance_threshold": -1.728361306244791e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9248404093251952, + "compression/movement_sparsity/model_sparsity": 0.8930692961229901, + "compression_loss": 105.5167236328125, + "distillation_loss": 4.368237495422363, + "epoch": 4.96, + "learning_rate": 2.7993801070724153e-05, + "loss": 109.1265, + "step": 5869, + "task_loss": 2.9064948558807373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999976832738409, + "compression/movement_sparsity/importance_threshold": -1.6225735668257135e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249473214122182, + "compression/movement_sparsity/model_sparsity": 0.893172535450924, + "compression_loss": 105.51616668701172, + "distillation_loss": 4.509244918823242, + "epoch": 4.96, + "learning_rate": 2.7989104912181836e-05, + "loss": 109.1984, + "step": 5870, + "task_loss": 2.7013630867004395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999978280259921, + "compression/movement_sparsity/importance_threshold": -1.521192998649168e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249166286047236, + "compression/movement_sparsity/model_sparsity": 0.8931428970357889, + "compression_loss": 105.51564025878906, + "distillation_loss": 3.6194417476654053, + "epoch": 4.96, + "learning_rate": 2.7984408753639523e-05, + "loss": 108.8022, + "step": 5871, + "task_loss": 1.9047770500183105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999979666194269, + "compression/movement_sparsity/importance_threshold": -1.4241258321917583e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249165093630473, + "compression/movement_sparsity/model_sparsity": 0.893142781890431, + "compression_loss": 105.51509857177734, + "distillation_loss": 4.460231781005859, + "epoch": 4.96, + "learning_rate": 2.7979712595097212e-05, + "loss": 109.9072, + "step": 5872, + "task_loss": 2.05206561088562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999980991880303, + "compression/movement_sparsity/importance_threshold": -1.3312782977566162e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.924910261099206, + "compression/movement_sparsity/model_sparsity": 0.8931367482736746, + "compression_loss": 105.51454162597656, + "distillation_loss": 3.354055643081665, + "epoch": 4.96, + "learning_rate": 2.7975016436554902e-05, + "loss": 108.8324, + "step": 5873, + "task_loss": 1.7151175737380981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999982258656874, + "compression/movement_sparsity/importance_threshold": -1.2425566259938181e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249130752027681, + "compression/movement_sparsity/model_sparsity": 0.8931394657041221, + "compression_loss": 105.513916015625, + "distillation_loss": 2.9095869064331055, + "epoch": 4.96, + "learning_rate": 2.7970320278012585e-05, + "loss": 108.7118, + "step": 5874, + "task_loss": 1.9037086963653564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999983467862836, + "compression/movement_sparsity/importance_threshold": -1.157867046946287e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249651003461633, + "compression/movement_sparsity/model_sparsity": 0.8931897036237937, + "compression_loss": 105.51336669921875, + "distillation_loss": 4.811957836151123, + "epoch": 4.97, + "learning_rate": 2.7965624119470275e-05, + "loss": 109.5832, + "step": 5875, + "task_loss": 2.759206771850586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999984620837039, + "compression/movement_sparsity/importance_threshold": -1.0771157912640994e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249788846839504, + "compression/movement_sparsity/model_sparsity": 0.8932030144271722, + "compression_loss": 105.51270294189453, + "distillation_loss": 3.9048945903778076, + "epoch": 4.97, + "learning_rate": 2.7960927960927964e-05, + "loss": 109.1663, + "step": 5876, + "task_loss": 1.7396193742752075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999985718918334, + "compression/movement_sparsity/importance_threshold": -1.0002090892503868e-08, + "compression/movement_sparsity/linear_layer_sparsity": 0.925012463140013, + "compression/movement_sparsity/model_sparsity": 0.8932354393599696, + "compression_loss": 105.51212310791016, + "distillation_loss": 3.7623343467712402, + "epoch": 4.97, + "learning_rate": 2.795623180238565e-05, + "loss": 109.2077, + "step": 5877, + "task_loss": 1.5687006711959839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999986763445573, + "compression/movement_sparsity/importance_threshold": -9.27053171295017e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249954831252996, + "compression/movement_sparsity/model_sparsity": 0.8932190426609982, + "compression_loss": 105.51154327392578, + "distillation_loss": 4.212191104888916, + "epoch": 4.97, + "learning_rate": 2.7951535643843334e-05, + "loss": 109.4948, + "step": 5878, + "task_loss": 1.865241289138794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999987755757607, + "compression/movement_sparsity/importance_threshold": -8.57554267874594e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.924959054793172, + "compression/movement_sparsity/model_sparsity": 0.8931838657541459, + "compression_loss": 105.51097106933594, + "distillation_loss": 4.179649353027344, + "epoch": 4.97, + "learning_rate": 2.7946839485301023e-05, + "loss": 109.6506, + "step": 5879, + "task_loss": 3.6693124771118164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999988697193287, + "compression/movement_sparsity/importance_threshold": -7.916186092922495e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249164974388796, + "compression/movement_sparsity/model_sparsity": 0.8931427703758952, + "compression_loss": 105.51036834716797, + "distillation_loss": 3.6689977645874023, + "epoch": 4.97, + "learning_rate": 2.7942143326758713e-05, + "loss": 109.9064, + "step": 5880, + "task_loss": 1.5843734741210938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999989589091466, + "compression/movement_sparsity/importance_threshold": -7.291524260245874e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249577192863968, + "compression/movement_sparsity/model_sparsity": 0.8931825761261368, + "compression_loss": 105.50984954833984, + "distillation_loss": 3.979867458343506, + "epoch": 4.97, + "learning_rate": 2.7937447168216403e-05, + "loss": 109.0381, + "step": 5881, + "task_loss": 2.6386313438415527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999990432790993, + "compression/movement_sparsity/importance_threshold": -6.700619484614756e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249835589576637, + "compression/movement_sparsity/model_sparsity": 0.8932075281252037, + "compression_loss": 105.50929260253906, + "distillation_loss": 3.2594900131225586, + "epoch": 4.97, + "learning_rate": 2.7932751009674086e-05, + "loss": 109.1922, + "step": 5882, + "task_loss": 1.8288002014160156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999991229630721, + "compression/movement_sparsity/importance_threshold": -6.1425340699278186e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.924989211013123, + "compression/movement_sparsity/model_sparsity": 0.8932129860151702, + "compression_loss": 105.50875854492188, + "distillation_loss": 4.670947551727295, + "epoch": 4.97, + "learning_rate": 2.7928054851131776e-05, + "loss": 109.7989, + "step": 5883, + "task_loss": 3.5299787521362305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999991980949503, + "compression/movement_sparsity/importance_threshold": -5.6163303183490165e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249779426747072, + "compression/movement_sparsity/model_sparsity": 0.8932021047788444, + "compression_loss": 105.50823211669922, + "distillation_loss": 3.950338840484619, + "epoch": 4.97, + "learning_rate": 2.7923358692589462e-05, + "loss": 109.2451, + "step": 5884, + "task_loss": 2.941614866256714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999992688086187, + "compression/movement_sparsity/importance_threshold": -5.121070537246475e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249505886341505, + "compression/movement_sparsity/model_sparsity": 0.8931756904337317, + "compression_loss": 105.50763702392578, + "distillation_loss": 4.852612018585205, + "epoch": 4.97, + "learning_rate": 2.791866253404715e-05, + "loss": 109.0179, + "step": 5885, + "task_loss": 1.847052812576294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999993352379626, + "compression/movement_sparsity/importance_threshold": -4.6558170279167865e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249317722976211, + "compression/movement_sparsity/model_sparsity": 0.893157520496248, + "compression_loss": 105.50706481933594, + "distillation_loss": 5.089334487915039, + "epoch": 4.97, + "learning_rate": 2.791396637550484e-05, + "loss": 109.5355, + "step": 5886, + "task_loss": 2.56630277633667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999993975168672, + "compression/movement_sparsity/importance_threshold": -4.219632095993353e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249224595226975, + "compression/movement_sparsity/model_sparsity": 0.8931485276437924, + "compression_loss": 105.50646209716797, + "distillation_loss": 3.3884177207946777, + "epoch": 4.98, + "learning_rate": 2.7909270216962524e-05, + "loss": 108.9685, + "step": 5887, + "task_loss": 2.3193747997283936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999994557792176, + "compression/movement_sparsity/importance_threshold": -3.811578043640129e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249176898556432, + "compression/movement_sparsity/model_sparsity": 0.8931439218294746, + "compression_loss": 105.50586700439453, + "distillation_loss": 4.13754940032959, + "epoch": 4.98, + "learning_rate": 2.7904574058420214e-05, + "loss": 109.9003, + "step": 5888, + "task_loss": 2.5789523124694824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999995101588989, + "compression/movement_sparsity/importance_threshold": -3.4307171756231547e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249341213586454, + "compression/movement_sparsity/model_sparsity": 0.8931597888597994, + "compression_loss": 105.50529479980469, + "distillation_loss": 4.677050590515137, + "epoch": 4.98, + "learning_rate": 2.78998778998779e-05, + "loss": 109.0332, + "step": 5889, + "task_loss": 2.284461736679077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999995607897962, + "compression/movement_sparsity/importance_threshold": -3.076111797575831e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249663285354298, + "compression/movement_sparsity/model_sparsity": 0.8931908896209805, + "compression_loss": 105.50463104248047, + "distillation_loss": 3.4984352588653564, + "epoch": 4.98, + "learning_rate": 2.789518174133559e-05, + "loss": 109.745, + "step": 5890, + "task_loss": 2.1903717517852783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999996078057948, + "compression/movement_sparsity/importance_threshold": -2.7468242107947516e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.925006799160386, + "compression/movement_sparsity/model_sparsity": 0.8932299699554672, + "compression_loss": 105.5040283203125, + "distillation_loss": 1.9705235958099365, + "epoch": 4.98, + "learning_rate": 2.7890485582793273e-05, + "loss": 108.7467, + "step": 5891, + "task_loss": 0.6571514010429382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999996513407796, + "compression/movement_sparsity/importance_threshold": -2.441916721780679e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9250161119353095, + "compression/movement_sparsity/model_sparsity": 0.8932389628079227, + "compression_loss": 105.50341796875, + "distillation_loss": 3.0539584159851074, + "epoch": 4.98, + "learning_rate": 2.7885789424250963e-05, + "loss": 109.6007, + "step": 5892, + "task_loss": 1.979411005973816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999999691528636, + "compression/movement_sparsity/importance_threshold": -2.1604516326975687e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9250263428711412, + "compression/movement_sparsity/model_sparsity": 0.8932488422796344, + "compression_loss": 105.50274658203125, + "distillation_loss": 2.814729690551758, + "epoch": 4.98, + "learning_rate": 2.7881093265708653e-05, + "loss": 109.698, + "step": 5893, + "task_loss": 1.5163878202438354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999999728503249, + "compression/movement_sparsity/importance_threshold": -1.90149124831146e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9249927763392461, + "compression/movement_sparsity/model_sparsity": 0.8932164288613728, + "compression_loss": 105.50222778320312, + "distillation_loss": 2.719080924987793, + "epoch": 4.98, + "learning_rate": 2.7876397107166342e-05, + "loss": 108.7807, + "step": 5894, + "task_loss": 1.2525080442428589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999997623985037, + "compression/movement_sparsity/importance_threshold": -1.664097872521031e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9250131905142388, + "compression/movement_sparsity/model_sparsity": 0.8932361417466531, + "compression_loss": 105.5015640258789, + "distillation_loss": 3.8779735565185547, + "epoch": 4.98, + "learning_rate": 2.7871700948624025e-05, + "loss": 108.9899, + "step": 5895, + "task_loss": 2.3458926677703857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999997933482855, + "compression/movement_sparsity/importance_threshold": -1.4473338083575982e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9250223721233184, + "compression/movement_sparsity/model_sparsity": 0.8932450079392149, + "compression_loss": 105.5009765625, + "distillation_loss": 5.491689682006836, + "epoch": 4.98, + "learning_rate": 2.7867004790081715e-05, + "loss": 109.5036, + "step": 5896, + "task_loss": 2.5188772678375244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999998214864791, + "compression/movement_sparsity/importance_threshold": -1.2502613614545632e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9250948472142092, + "compression/movement_sparsity/model_sparsity": 0.8933149932877741, + "compression_loss": 105.50039672851562, + "distillation_loss": 4.8426384925842285, + "epoch": 4.98, + "learning_rate": 2.78623086315394e-05, + "loss": 109.679, + "step": 5897, + "task_loss": 2.037792921066284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999998469469701, + "compression/movement_sparsity/importance_threshold": -1.0719428348432425e-09, + "compression/movement_sparsity/linear_layer_sparsity": 0.9250792623271091, + "compression/movement_sparsity/model_sparsity": 0.8932999437894906, + "compression_loss": 105.499755859375, + "distillation_loss": 2.8470168113708496, + "epoch": 4.99, + "learning_rate": 2.785761247299709e-05, + "loss": 109.256, + "step": 5898, + "task_loss": 1.5510737895965576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999998698636433, + "compression/movement_sparsity/importance_threshold": -9.11440532422314e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9250863214343495, + "compression/movement_sparsity/model_sparsity": 0.8933067603946809, + "compression_loss": 105.4991226196289, + "distillation_loss": 3.4297666549682617, + "epoch": 4.99, + "learning_rate": 2.785291631445478e-05, + "loss": 109.2113, + "step": 5899, + "task_loss": 3.621725082397461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999998903703841, + "compression/movement_sparsity/importance_threshold": -7.67816758090456e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9251470035234485, + "compression/movement_sparsity/model_sparsity": 0.8933653578673394, + "compression_loss": 105.49853515625, + "distillation_loss": 4.156139373779297, + "epoch": 4.99, + "learning_rate": 2.7848220155912464e-05, + "loss": 109.5652, + "step": 5900, + "task_loss": 3.342054843902588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999086010773, + "compression/movement_sparsity/importance_threshold": -6.4013381748107e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9251133773707153, + "compression/movement_sparsity/model_sparsity": 0.8933328868763988, + "compression_loss": 105.49789428710938, + "distillation_loss": 2.710592746734619, + "epoch": 4.99, + "learning_rate": 2.7843523997370153e-05, + "loss": 108.9363, + "step": 5901, + "task_loss": 1.3197705745697021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999246896084, + "compression/movement_sparsity/importance_threshold": -5.274540118907489e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9250882293011713, + "compression/movement_sparsity/model_sparsity": 0.8933086027204081, + "compression_loss": 105.49723052978516, + "distillation_loss": 4.235291481018066, + "epoch": 4.99, + "learning_rate": 2.783882783882784e-05, + "loss": 109.3602, + "step": 5902, + "task_loss": 2.227602243423462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999387698624, + "compression/movement_sparsity/importance_threshold": -4.2883964695289434e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.92510654482266, + "compression/movement_sparsity/model_sparsity": 0.8933262890473885, + "compression_loss": 105.49665069580078, + "distillation_loss": 4.263698577880859, + "epoch": 4.99, + "learning_rate": 2.783413168028553e-05, + "loss": 109.1425, + "step": 5903, + "task_loss": 2.08604097366333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999509757244, + "compression/movement_sparsity/importance_threshold": -3.433530265661844e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9251375715068485, + "compression/movement_sparsity/model_sparsity": 0.8933562498695259, + "compression_loss": 105.49600219726562, + "distillation_loss": 5.044488906860352, + "epoch": 4.99, + "learning_rate": 2.7829435521743212e-05, + "loss": 108.9655, + "step": 5904, + "task_loss": 3.1151111125946045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999614410795, + "compression/movement_sparsity/importance_threshold": -2.7005645376193543e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.92512212970976, + "compression/movement_sparsity/model_sparsity": 0.893341338545672, + "compression_loss": 105.495361328125, + "distillation_loss": 3.8421998023986816, + "epoch": 4.99, + "learning_rate": 2.7824739363200902e-05, + "loss": 109.0103, + "step": 5905, + "task_loss": 2.1294994354248047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.999999970299813, + "compression/movement_sparsity/importance_threshold": -2.0801223330618734e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9251399324920404, + "compression/movement_sparsity/model_sparsity": 0.8933585297476132, + "compression_loss": 105.4947738647461, + "distillation_loss": 3.687211036682129, + "epoch": 4.99, + "learning_rate": 2.7820043204658592e-05, + "loss": 109.3556, + "step": 5906, + "task_loss": 1.873538613319397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.99999997768581, + "compression/movement_sparsity/importance_threshold": -1.5628266996497997e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9251245264674548, + "compression/movement_sparsity/model_sparsity": 0.8933436529673667, + "compression_loss": 105.4941177368164, + "distillation_loss": 5.711050510406494, + "epoch": 4.99, + "learning_rate": 2.781534704611628e-05, + "loss": 110.006, + "step": 5907, + "task_loss": 3.399139642715454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999837329554, + "compression/movement_sparsity/importance_threshold": -1.1393006676962969e-10, + "compression/movement_sparsity/linear_layer_sparsity": 0.9252315577961544, + "compression/movement_sparsity/model_sparsity": 0.8934470074406585, + "compression_loss": 105.49349975585938, + "distillation_loss": 5.6282243728637695, + "epoch": 4.99, + "learning_rate": 2.7810650887573965e-05, + "loss": 110.0112, + "step": 5908, + "task_loss": 3.2896342277526855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999885751346, + "compression/movement_sparsity/importance_threshold": -8.001672761881462e-11, + "compression/movement_sparsity/linear_layer_sparsity": 0.925334952253725, + "compression/movement_sparsity/model_sparsity": 0.8935468499805329, + "compression_loss": 105.492919921875, + "distillation_loss": 4.258896827697754, + "epoch": 4.99, + "learning_rate": 2.780595472903165e-05, + "loss": 109.4165, + "step": 5909, + "task_loss": 2.5525002479553223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999923462328, + "compression/movement_sparsity/importance_threshold": -5.360495554385114e-11, + "compression/movement_sparsity/linear_layer_sparsity": 0.9253391495607327, + "compression/movement_sparsity/model_sparsity": 0.8935509030971326, + "compression_loss": 105.4923095703125, + "distillation_loss": 3.679643154144287, + "epoch": 5.0, + "learning_rate": 2.780125857048934e-05, + "loss": 109.1786, + "step": 5910, + "task_loss": 1.9977991580963135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999951801349, + "compression/movement_sparsity/importance_threshold": -3.375705704550258e-11, + "compression/movement_sparsity/linear_layer_sparsity": 0.9253683399231053, + "compression/movement_sparsity/model_sparsity": 0.8935790906807577, + "compression_loss": 105.49173736572266, + "distillation_loss": 3.327390670776367, + "epoch": 5.0, + "learning_rate": 2.779656241194703e-05, + "loss": 109.316, + "step": 5911, + "task_loss": 2.482755661010742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999972107262, + "compression/movement_sparsity/importance_threshold": -1.9535334287723582e-11, + "compression/movement_sparsity/linear_layer_sparsity": 0.9253979237830099, + "compression/movement_sparsity/model_sparsity": 0.893607658244064, + "compression_loss": 105.49112701416016, + "distillation_loss": 5.1300048828125, + "epoch": 5.0, + "learning_rate": 2.7791866253404713e-05, + "loss": 110.1546, + "step": 5912, + "task_loss": 2.7778573036193848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999985718918, + "compression/movement_sparsity/importance_threshold": -1.0002091169192262e-11, + "compression/movement_sparsity/linear_layer_sparsity": 0.9253641783886004, + "compression/movement_sparsity/model_sparsity": 0.8935750721077653, + "compression_loss": 105.49063110351562, + "distillation_loss": 4.768748760223389, + "epoch": 5.0, + "learning_rate": 2.7787170094862403e-05, + "loss": 109.218, + "step": 5913, + "task_loss": 2.8526062965393066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999993975168, + "compression/movement_sparsity/importance_threshold": -4.219632455948474e-12, + "compression/movement_sparsity/linear_layer_sparsity": 0.9253416774842715, + "compression/movement_sparsity/model_sparsity": 0.8935533441787211, + "compression_loss": 105.49003601074219, + "distillation_loss": 4.018354415893555, + "epoch": 5.0, + "learning_rate": 2.7782473936320093e-05, + "loss": 109.3875, + "step": 5914, + "task_loss": 2.262068271636963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 0.9999999998214865, + "compression/movement_sparsity/importance_threshold": -1.2502611793085983e-12, + "compression/movement_sparsity/linear_layer_sparsity": 0.9254309775756965, + "compression/movement_sparsity/model_sparsity": 0.8936395765372864, + "compression_loss": 105.48946380615234, + "distillation_loss": 3.139014720916748, + "epoch": 5.0, + "learning_rate": 2.777777777777778e-05, + "loss": 108.7148, + "step": 5915, + "task_loss": 1.5968846082687378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 9.107519149780273, + "epoch": 5.0, + "learning_rate": 2.777308161923547e-05, + "loss": 89.7272, + "step": 5916, + "task_loss": 3.68790864944458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 5.998353004455566, + "epoch": 5.0, + "learning_rate": 2.7768385460693152e-05, + "loss": 7.0484, + "step": 5917, + "task_loss": 3.254852771759033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 5.807075500488281, + "epoch": 5.0, + "learning_rate": 2.776368930215084e-05, + "loss": 5.5135, + "step": 5918, + "task_loss": 3.0269157886505127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 4.923386096954346, + "epoch": 5.0, + "learning_rate": 2.775899314360853e-05, + "loss": 5.2188, + "step": 5919, + "task_loss": 1.4955283403396606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 4.83468770980835, + "epoch": 5.0, + "learning_rate": 2.775429698506622e-05, + "loss": 4.3778, + "step": 5920, + "task_loss": 2.905348062515259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 4.057333469390869, + "epoch": 5.01, + "learning_rate": 2.7749600826523904e-05, + "loss": 4.1993, + "step": 5921, + "task_loss": 1.8149784803390503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 4.103086948394775, + "epoch": 5.01, + "learning_rate": 2.774490466798159e-05, + "loss": 3.5052, + "step": 5922, + "task_loss": 2.137714385986328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.798582077026367, + "epoch": 5.01, + "learning_rate": 2.774020850943928e-05, + "loss": 3.6085, + "step": 5923, + "task_loss": 1.8862818479537964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 3.7657337188720703, + "epoch": 5.01, + "learning_rate": 2.773551235089697e-05, + "loss": 3.7961, + "step": 5924, + "task_loss": 2.2626469135284424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 3.1489675045013428, + "epoch": 5.01, + "learning_rate": 2.7730816192354653e-05, + "loss": 2.9824, + "step": 5925, + "task_loss": 1.7105740308761597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6984586715698242, + "epoch": 5.01, + "learning_rate": 2.7726120033812342e-05, + "loss": 2.7759, + "step": 5926, + "task_loss": 1.1583797931671143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.3696179389953613, + "epoch": 5.01, + "learning_rate": 2.7721423875270032e-05, + "loss": 2.3709, + "step": 5927, + "task_loss": 2.0790762901306152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.175184726715088, + "epoch": 5.01, + "learning_rate": 2.771672771672772e-05, + "loss": 2.3906, + "step": 5928, + "task_loss": 1.166720986366272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 3.792428970336914, + "epoch": 5.01, + "learning_rate": 2.7712031558185408e-05, + "loss": 2.5864, + "step": 5929, + "task_loss": 2.224179267883301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.52396821975708, + "epoch": 5.01, + "learning_rate": 2.770733539964309e-05, + "loss": 2.4367, + "step": 5930, + "task_loss": 1.4490300416946411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.095733642578125, + "epoch": 5.01, + "learning_rate": 2.770263924110078e-05, + "loss": 2.6712, + "step": 5931, + "task_loss": 2.071390151977539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.496537685394287, + "epoch": 5.01, + "learning_rate": 2.769794308255847e-05, + "loss": 2.2837, + "step": 5932, + "task_loss": 0.7397240400314331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.4572761058807373, + "epoch": 5.02, + "learning_rate": 2.7693246924016157e-05, + "loss": 2.173, + "step": 5933, + "task_loss": 1.1359903812408447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.441149950027466, + "epoch": 5.02, + "learning_rate": 2.7688550765473843e-05, + "loss": 2.6266, + "step": 5934, + "task_loss": 2.4219510555267334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.9012343883514404, + "epoch": 5.02, + "learning_rate": 2.768385460693153e-05, + "loss": 2.2335, + "step": 5935, + "task_loss": 1.1415892839431763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8224756717681885, + "epoch": 5.02, + "learning_rate": 2.767915844838922e-05, + "loss": 2.1349, + "step": 5936, + "task_loss": 0.7082262635231018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8892982006072998, + "epoch": 5.02, + "learning_rate": 2.767446228984691e-05, + "loss": 1.9177, + "step": 5937, + "task_loss": 1.2918776273727417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.3299925327301025, + "epoch": 5.02, + "learning_rate": 2.7669766131304592e-05, + "loss": 2.2113, + "step": 5938, + "task_loss": 1.7329392433166504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7745351791381836, + "epoch": 5.02, + "learning_rate": 2.7665069972762282e-05, + "loss": 1.8083, + "step": 5939, + "task_loss": 2.4126667976379395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.226346254348755, + "epoch": 5.02, + "learning_rate": 2.7660373814219968e-05, + "loss": 2.322, + "step": 5940, + "task_loss": 1.2096538543701172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.5984599590301514, + "epoch": 5.02, + "learning_rate": 2.7655677655677658e-05, + "loss": 2.3425, + "step": 5941, + "task_loss": 1.8854190111160278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.4422881603240967, + "epoch": 5.02, + "learning_rate": 2.765098149713534e-05, + "loss": 2.066, + "step": 5942, + "task_loss": 2.004627227783203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.118638038635254, + "epoch": 5.02, + "learning_rate": 2.764628533859303e-05, + "loss": 2.2797, + "step": 5943, + "task_loss": 1.4140465259552002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.347839117050171, + "epoch": 5.02, + "learning_rate": 2.764158918005072e-05, + "loss": 2.062, + "step": 5944, + "task_loss": 0.9017975330352783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.8050596714019775, + "epoch": 5.03, + "learning_rate": 2.763689302150841e-05, + "loss": 2.4213, + "step": 5945, + "task_loss": 1.5031167268753052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4305057525634766, + "epoch": 5.03, + "learning_rate": 2.7632196862966096e-05, + "loss": 1.3526, + "step": 5946, + "task_loss": 0.7485719323158264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.689972162246704, + "epoch": 5.03, + "learning_rate": 2.7627500704423783e-05, + "loss": 2.103, + "step": 5947, + "task_loss": 0.5695490837097168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.046323776245117, + "epoch": 5.03, + "learning_rate": 2.762280454588147e-05, + "loss": 1.4623, + "step": 5948, + "task_loss": 2.094740390777588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.362403154373169, + "epoch": 5.03, + "learning_rate": 2.761810838733916e-05, + "loss": 1.8346, + "step": 5949, + "task_loss": 1.1634949445724487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.498169422149658, + "epoch": 5.03, + "learning_rate": 2.761341222879685e-05, + "loss": 1.8253, + "step": 5950, + "task_loss": 1.6829357147216797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9084558486938477, + "epoch": 5.03, + "learning_rate": 2.760871607025453e-05, + "loss": 2.0019, + "step": 5951, + "task_loss": 1.7362726926803589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.0300230979919434, + "epoch": 5.03, + "learning_rate": 2.760401991171222e-05, + "loss": 1.6678, + "step": 5952, + "task_loss": 1.5908313989639282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9563615322113037, + "epoch": 5.03, + "learning_rate": 2.7599323753169907e-05, + "loss": 1.8268, + "step": 5953, + "task_loss": 1.6966561079025269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.2024641036987305, + "epoch": 5.03, + "learning_rate": 2.7594627594627597e-05, + "loss": 1.7741, + "step": 5954, + "task_loss": 1.0555740594863892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9592430591583252, + "epoch": 5.03, + "learning_rate": 2.758993143608528e-05, + "loss": 1.8663, + "step": 5955, + "task_loss": 1.6705167293548584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.0182042121887207, + "epoch": 5.03, + "learning_rate": 2.758523527754297e-05, + "loss": 1.8086, + "step": 5956, + "task_loss": 1.6772360801696777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3420097827911377, + "epoch": 5.04, + "learning_rate": 2.758053911900066e-05, + "loss": 1.8859, + "step": 5957, + "task_loss": 0.886572003364563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7672629356384277, + "epoch": 5.04, + "learning_rate": 2.757584296045835e-05, + "loss": 1.792, + "step": 5958, + "task_loss": 1.9365952014923096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8932313919067383, + "epoch": 5.04, + "learning_rate": 2.7571146801916032e-05, + "loss": 2.0352, + "step": 5959, + "task_loss": 2.4446828365325928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2710652351379395, + "epoch": 5.04, + "learning_rate": 2.756645064337372e-05, + "loss": 1.574, + "step": 5960, + "task_loss": 0.9964640140533447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.934610366821289, + "epoch": 5.04, + "learning_rate": 2.756175448483141e-05, + "loss": 1.9456, + "step": 5961, + "task_loss": 0.6763312816619873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.792356252670288, + "epoch": 5.04, + "learning_rate": 2.7557058326289098e-05, + "loss": 1.7807, + "step": 5962, + "task_loss": 0.8963178396224976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4659570455551147, + "epoch": 5.04, + "learning_rate": 2.7552362167746788e-05, + "loss": 1.8935, + "step": 5963, + "task_loss": 1.4169306755065918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4013087749481201, + "epoch": 5.04, + "learning_rate": 2.754766600920447e-05, + "loss": 1.3332, + "step": 5964, + "task_loss": 0.8059857487678528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3107569217681885, + "epoch": 5.04, + "learning_rate": 2.754296985066216e-05, + "loss": 1.5869, + "step": 5965, + "task_loss": 1.4338014125823975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.1250243186950684, + "epoch": 5.04, + "learning_rate": 2.7538273692119847e-05, + "loss": 2.1255, + "step": 5966, + "task_loss": 1.3717212677001953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.465025544166565, + "epoch": 5.04, + "learning_rate": 2.7533577533577537e-05, + "loss": 1.7426, + "step": 5967, + "task_loss": 1.0294339656829834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.2154417037963867, + "epoch": 5.04, + "learning_rate": 2.752888137503522e-05, + "loss": 1.9514, + "step": 5968, + "task_loss": 1.8215508460998535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.6885862350463867, + "epoch": 5.05, + "learning_rate": 2.752418521649291e-05, + "loss": 1.6836, + "step": 5969, + "task_loss": 2.9758992195129395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6610649824142456, + "epoch": 5.05, + "learning_rate": 2.75194890579506e-05, + "loss": 1.6842, + "step": 5970, + "task_loss": 1.486685037612915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0938032865524292, + "epoch": 5.05, + "learning_rate": 2.751479289940829e-05, + "loss": 1.3269, + "step": 5971, + "task_loss": 0.9728466272354126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8419153690338135, + "epoch": 5.05, + "learning_rate": 2.751009674086597e-05, + "loss": 1.7515, + "step": 5972, + "task_loss": 1.336808204650879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1511118412017822, + "epoch": 5.05, + "learning_rate": 2.7505400582323658e-05, + "loss": 1.2678, + "step": 5973, + "task_loss": 0.34537604451179504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6735496520996094, + "epoch": 5.05, + "learning_rate": 2.7500704423781348e-05, + "loss": 2.2292, + "step": 5974, + "task_loss": 1.4412600994110107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.815626621246338, + "epoch": 5.05, + "learning_rate": 2.7496008265239037e-05, + "loss": 1.8289, + "step": 5975, + "task_loss": 1.2378538846969604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6196995973587036, + "epoch": 5.05, + "learning_rate": 2.7491312106696727e-05, + "loss": 1.5102, + "step": 5976, + "task_loss": 2.0097298622131348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7491836547851562, + "epoch": 5.05, + "learning_rate": 2.748661594815441e-05, + "loss": 1.5927, + "step": 5977, + "task_loss": 1.5157792568206787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9714813232421875, + "epoch": 5.05, + "learning_rate": 2.74819197896121e-05, + "loss": 1.8374, + "step": 5978, + "task_loss": 1.2396095991134644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8989825248718262, + "epoch": 5.05, + "learning_rate": 2.7477223631069786e-05, + "loss": 1.515, + "step": 5979, + "task_loss": 1.158367395401001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.731797456741333, + "epoch": 5.05, + "learning_rate": 2.7472527472527476e-05, + "loss": 1.1383, + "step": 5980, + "task_loss": 0.4815087914466858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4883168935775757, + "epoch": 5.06, + "learning_rate": 2.746783131398516e-05, + "loss": 1.401, + "step": 5981, + "task_loss": 1.5791443586349487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.933860421180725, + "epoch": 5.06, + "learning_rate": 2.746313515544285e-05, + "loss": 1.5846, + "step": 5982, + "task_loss": 0.7577766180038452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3321192264556885, + "epoch": 5.06, + "learning_rate": 2.7458438996900538e-05, + "loss": 1.3451, + "step": 5983, + "task_loss": 0.5090982913970947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.0332798957824707, + "epoch": 5.06, + "learning_rate": 2.7453742838358225e-05, + "loss": 2.1848, + "step": 5984, + "task_loss": 1.3390953540802002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4984904527664185, + "epoch": 5.06, + "learning_rate": 2.744904667981591e-05, + "loss": 1.414, + "step": 5985, + "task_loss": 0.9739983081817627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6785483360290527, + "epoch": 5.06, + "learning_rate": 2.7444350521273597e-05, + "loss": 1.4638, + "step": 5986, + "task_loss": 1.3667768239974976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.669451117515564, + "epoch": 5.06, + "learning_rate": 2.7439654362731287e-05, + "loss": 1.8552, + "step": 5987, + "task_loss": 1.5857038497924805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5099369287490845, + "epoch": 5.06, + "learning_rate": 2.7434958204188977e-05, + "loss": 1.6199, + "step": 5988, + "task_loss": 1.3627605438232422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.477861762046814, + "epoch": 5.06, + "learning_rate": 2.743026204564666e-05, + "loss": 1.6036, + "step": 5989, + "task_loss": 1.4046006202697754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1466853618621826, + "epoch": 5.06, + "learning_rate": 2.742556588710435e-05, + "loss": 1.4119, + "step": 5990, + "task_loss": 0.7919737100601196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1878602504730225, + "epoch": 5.06, + "learning_rate": 2.742086972856204e-05, + "loss": 1.5422, + "step": 5991, + "task_loss": 1.6900066137313843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6206276416778564, + "epoch": 5.07, + "learning_rate": 2.7416173570019726e-05, + "loss": 1.4602, + "step": 5992, + "task_loss": 0.4422968327999115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.222472667694092, + "epoch": 5.07, + "learning_rate": 2.7411477411477415e-05, + "loss": 1.6409, + "step": 5993, + "task_loss": 1.4876128435134888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.1480159759521484, + "epoch": 5.07, + "learning_rate": 2.7406781252935098e-05, + "loss": 1.6129, + "step": 5994, + "task_loss": 1.3178582191467285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5628468990325928, + "epoch": 5.07, + "learning_rate": 2.7402085094392788e-05, + "loss": 1.988, + "step": 5995, + "task_loss": 1.6689242124557495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.068885087966919, + "epoch": 5.07, + "learning_rate": 2.7397388935850478e-05, + "loss": 1.7041, + "step": 5996, + "task_loss": 0.7173780202865601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6105893850326538, + "epoch": 5.07, + "learning_rate": 2.7392692777308164e-05, + "loss": 1.5997, + "step": 5997, + "task_loss": 1.1925582885742188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4162261486053467, + "epoch": 5.07, + "learning_rate": 2.738799661876585e-05, + "loss": 1.5213, + "step": 5998, + "task_loss": 0.466021865606308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4770967960357666, + "epoch": 5.07, + "learning_rate": 2.7383300460223537e-05, + "loss": 1.5651, + "step": 5999, + "task_loss": 0.6623186469078064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.3965227603912354, + "epoch": 5.07, + "learning_rate": 2.7378604301681226e-05, + "loss": 1.8848, + "step": 6000, + "task_loss": 1.2494666576385498 + }, + { + "epoch": 5.07, + "eval_accuracy": 0.8315247524752475, + "eval_loss": 0.9807378053665161, + "eval_runtime": 226.339, + "eval_samples_per_second": 111.558, + "eval_steps_per_second": 0.875, + "step": 6000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7662501335144043, + "epoch": 5.07, + "learning_rate": 2.7373908143138916e-05, + "loss": 1.3311, + "step": 6001, + "task_loss": 1.343724250793457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9522209167480469, + "epoch": 5.07, + "learning_rate": 2.73692119845966e-05, + "loss": 1.9406, + "step": 6002, + "task_loss": 2.0613434314727783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7874780893325806, + "epoch": 5.07, + "learning_rate": 2.736451582605429e-05, + "loss": 1.5376, + "step": 6003, + "task_loss": 1.2606388330459595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8563027381896973, + "epoch": 5.08, + "learning_rate": 2.7359819667511975e-05, + "loss": 1.4796, + "step": 6004, + "task_loss": 1.0875974893569946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8217250108718872, + "epoch": 5.08, + "learning_rate": 2.7355123508969665e-05, + "loss": 1.3016, + "step": 6005, + "task_loss": 0.9278236627578735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8831559419631958, + "epoch": 5.08, + "learning_rate": 2.7350427350427355e-05, + "loss": 1.3962, + "step": 6006, + "task_loss": 1.3202184438705444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2185256481170654, + "epoch": 5.08, + "learning_rate": 2.7345731191885038e-05, + "loss": 1.3391, + "step": 6007, + "task_loss": 1.0860174894332886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48492521047592163, + "epoch": 5.08, + "learning_rate": 2.7341035033342727e-05, + "loss": 1.3083, + "step": 6008, + "task_loss": 1.0913361310958862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.29097843170166, + "epoch": 5.08, + "learning_rate": 2.7336338874800417e-05, + "loss": 1.6612, + "step": 6009, + "task_loss": 1.0779439210891724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.831801414489746, + "epoch": 5.08, + "learning_rate": 2.7331642716258103e-05, + "loss": 1.6818, + "step": 6010, + "task_loss": 1.0645065307617188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.2680459022521973, + "epoch": 5.08, + "learning_rate": 2.7326946557715786e-05, + "loss": 1.9825, + "step": 6011, + "task_loss": 2.25335693359375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.501272201538086, + "epoch": 5.08, + "learning_rate": 2.7322250399173476e-05, + "loss": 1.4031, + "step": 6012, + "task_loss": 1.037697434425354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.006222724914551, + "epoch": 5.08, + "learning_rate": 2.7317554240631166e-05, + "loss": 1.7654, + "step": 6013, + "task_loss": 1.8549330234527588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5703160762786865, + "epoch": 5.08, + "learning_rate": 2.7312858082088855e-05, + "loss": 1.2716, + "step": 6014, + "task_loss": 0.6937095522880554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.543492317199707, + "epoch": 5.08, + "learning_rate": 2.730816192354654e-05, + "loss": 1.3936, + "step": 6015, + "task_loss": 0.9498792886734009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4141132831573486, + "epoch": 5.09, + "learning_rate": 2.7303465765004228e-05, + "loss": 1.4084, + "step": 6016, + "task_loss": 2.1713035106658936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.2215139865875244, + "epoch": 5.09, + "learning_rate": 2.7298769606461915e-05, + "loss": 2.0117, + "step": 6017, + "task_loss": 1.5173630714416504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6084805727005005, + "epoch": 5.09, + "learning_rate": 2.7294073447919604e-05, + "loss": 1.404, + "step": 6018, + "task_loss": 1.4337973594665527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8146912455558777, + "epoch": 5.09, + "learning_rate": 2.7289377289377287e-05, + "loss": 1.2187, + "step": 6019, + "task_loss": 0.6366134881973267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.15373694896698, + "epoch": 5.09, + "learning_rate": 2.7284681130834977e-05, + "loss": 1.3174, + "step": 6020, + "task_loss": 0.638940691947937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.65285062789917, + "epoch": 5.09, + "learning_rate": 2.7279984972292667e-05, + "loss": 1.6437, + "step": 6021, + "task_loss": 1.9222254753112793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5105171203613281, + "epoch": 5.09, + "learning_rate": 2.7275288813750356e-05, + "loss": 1.578, + "step": 6022, + "task_loss": 1.066010594367981 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2227613925933838, + "epoch": 5.09, + "learning_rate": 2.7270592655208043e-05, + "loss": 1.5835, + "step": 6023, + "task_loss": 1.7856481075286865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9503490924835205, + "epoch": 5.09, + "learning_rate": 2.7265896496665726e-05, + "loss": 1.3087, + "step": 6024, + "task_loss": 0.6125936508178711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.561194896697998, + "epoch": 5.09, + "learning_rate": 2.7261200338123415e-05, + "loss": 1.2933, + "step": 6025, + "task_loss": 2.2164900302886963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7554807662963867, + "epoch": 5.09, + "learning_rate": 2.7256504179581105e-05, + "loss": 1.5082, + "step": 6026, + "task_loss": 0.9767686724662781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.0444374084472656, + "epoch": 5.09, + "learning_rate": 2.7251808021038795e-05, + "loss": 1.464, + "step": 6027, + "task_loss": 1.131515383720398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6854791641235352, + "epoch": 5.1, + "learning_rate": 2.7247111862496478e-05, + "loss": 1.471, + "step": 6028, + "task_loss": 2.2208142280578613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1964408159255981, + "epoch": 5.1, + "learning_rate": 2.7242415703954168e-05, + "loss": 1.4521, + "step": 6029, + "task_loss": 0.7715833783149719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1415436267852783, + "epoch": 5.1, + "learning_rate": 2.7237719545411854e-05, + "loss": 1.2644, + "step": 6030, + "task_loss": 1.5832778215408325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8606854677200317, + "epoch": 5.1, + "learning_rate": 2.7233023386869544e-05, + "loss": 1.2504, + "step": 6031, + "task_loss": 0.8619833588600159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.74061918258667, + "epoch": 5.1, + "learning_rate": 2.7228327228327227e-05, + "loss": 1.406, + "step": 6032, + "task_loss": 1.0596593618392944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9357960224151611, + "epoch": 5.1, + "learning_rate": 2.7223631069784916e-05, + "loss": 1.3556, + "step": 6033, + "task_loss": 0.7946279644966125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.196561336517334, + "epoch": 5.1, + "learning_rate": 2.7218934911242606e-05, + "loss": 0.958, + "step": 6034, + "task_loss": 0.9397717714309692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8972089290618896, + "epoch": 5.1, + "learning_rate": 2.7214238752700292e-05, + "loss": 1.4384, + "step": 6035, + "task_loss": 0.46294689178466797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6775273084640503, + "epoch": 5.1, + "learning_rate": 2.720954259415798e-05, + "loss": 1.4422, + "step": 6036, + "task_loss": 1.3346761465072632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.0288169384002686, + "epoch": 5.1, + "learning_rate": 2.7204846435615665e-05, + "loss": 1.4545, + "step": 6037, + "task_loss": 1.0529216527938843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.247133493423462, + "epoch": 5.1, + "learning_rate": 2.7200150277073355e-05, + "loss": 1.2885, + "step": 6038, + "task_loss": 1.5230854749679565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8417154550552368, + "epoch": 5.1, + "learning_rate": 2.7195454118531044e-05, + "loss": 1.1793, + "step": 6039, + "task_loss": 1.413912296295166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3710811138153076, + "epoch": 5.11, + "learning_rate": 2.7190757959988734e-05, + "loss": 1.0497, + "step": 6040, + "task_loss": 1.7059674263000488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5831513404846191, + "epoch": 5.11, + "learning_rate": 2.7186061801446417e-05, + "loss": 1.4102, + "step": 6041, + "task_loss": 0.9912034273147583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6207759380340576, + "epoch": 5.11, + "learning_rate": 2.7181365642904107e-05, + "loss": 1.8767, + "step": 6042, + "task_loss": 1.3208168745040894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2722163200378418, + "epoch": 5.11, + "learning_rate": 2.7176669484361793e-05, + "loss": 1.4569, + "step": 6043, + "task_loss": 0.8121294975280762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.565006971359253, + "epoch": 5.11, + "learning_rate": 2.7171973325819483e-05, + "loss": 1.7376, + "step": 6044, + "task_loss": 1.5227748155593872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8481736183166504, + "epoch": 5.11, + "learning_rate": 2.7167277167277166e-05, + "loss": 1.5595, + "step": 6045, + "task_loss": 0.9575451016426086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.968367338180542, + "epoch": 5.11, + "learning_rate": 2.7162581008734856e-05, + "loss": 1.2139, + "step": 6046, + "task_loss": 1.9066754579544067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1301653385162354, + "epoch": 5.11, + "learning_rate": 2.7157884850192545e-05, + "loss": 1.1689, + "step": 6047, + "task_loss": 0.768576979637146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0179550647735596, + "epoch": 5.11, + "learning_rate": 2.715318869165023e-05, + "loss": 1.2857, + "step": 6048, + "task_loss": 0.7878801226615906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3087983131408691, + "epoch": 5.11, + "learning_rate": 2.7148492533107918e-05, + "loss": 1.1058, + "step": 6049, + "task_loss": 0.9538347721099854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0463907718658447, + "epoch": 5.11, + "learning_rate": 2.7143796374565604e-05, + "loss": 1.1571, + "step": 6050, + "task_loss": 0.9137436151504517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4125372171401978, + "epoch": 5.11, + "learning_rate": 2.7139100216023294e-05, + "loss": 1.4137, + "step": 6051, + "task_loss": 1.0317579507827759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9229708909988403, + "epoch": 5.12, + "learning_rate": 2.7134404057480984e-05, + "loss": 1.1499, + "step": 6052, + "task_loss": 1.275350570678711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.331318974494934, + "epoch": 5.12, + "learning_rate": 2.7129707898938674e-05, + "loss": 1.1869, + "step": 6053, + "task_loss": 2.2413463592529297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.223301410675049, + "epoch": 5.12, + "learning_rate": 2.7125011740396357e-05, + "loss": 1.2375, + "step": 6054, + "task_loss": 1.8653202056884766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.038719892501831, + "epoch": 5.12, + "learning_rate": 2.7120315581854043e-05, + "loss": 1.4847, + "step": 6055, + "task_loss": 2.0970044136047363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.0520880222320557, + "epoch": 5.12, + "learning_rate": 2.7115619423311733e-05, + "loss": 1.3479, + "step": 6056, + "task_loss": 0.7605481147766113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.2198703289031982, + "epoch": 5.12, + "learning_rate": 2.7110923264769422e-05, + "loss": 1.5342, + "step": 6057, + "task_loss": 1.539783000946045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.760793924331665, + "epoch": 5.12, + "learning_rate": 2.7106227106227105e-05, + "loss": 1.4329, + "step": 6058, + "task_loss": 1.3907667398452759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0877609252929688, + "epoch": 5.12, + "learning_rate": 2.7101530947684795e-05, + "loss": 1.2575, + "step": 6059, + "task_loss": 1.697043538093567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1456019878387451, + "epoch": 5.12, + "learning_rate": 2.7096834789142485e-05, + "loss": 1.3013, + "step": 6060, + "task_loss": 0.39524057507514954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7074329853057861, + "epoch": 5.12, + "learning_rate": 2.709213863060017e-05, + "loss": 1.1546, + "step": 6061, + "task_loss": 1.4160292148590088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3175756931304932, + "epoch": 5.12, + "learning_rate": 2.7087442472057854e-05, + "loss": 0.9868, + "step": 6062, + "task_loss": 0.5832731127738953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1958972215652466, + "epoch": 5.13, + "learning_rate": 2.7082746313515544e-05, + "loss": 1.248, + "step": 6063, + "task_loss": 0.5755500793457031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2761794328689575, + "epoch": 5.13, + "learning_rate": 2.7078050154973233e-05, + "loss": 1.1467, + "step": 6064, + "task_loss": 0.6066094040870667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9923672676086426, + "epoch": 5.13, + "learning_rate": 2.7073353996430923e-05, + "loss": 1.2313, + "step": 6065, + "task_loss": 1.2634145021438599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.806491732597351, + "epoch": 5.13, + "learning_rate": 2.7068657837888606e-05, + "loss": 1.4147, + "step": 6066, + "task_loss": 1.430079460144043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8822911977767944, + "epoch": 5.13, + "learning_rate": 2.7063961679346296e-05, + "loss": 1.422, + "step": 6067, + "task_loss": 1.0056214332580566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.966623067855835, + "epoch": 5.13, + "learning_rate": 2.7059265520803982e-05, + "loss": 1.4988, + "step": 6068, + "task_loss": 1.0962300300598145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.99144446849823, + "epoch": 5.13, + "learning_rate": 2.7054569362261672e-05, + "loss": 1.3089, + "step": 6069, + "task_loss": 0.6787312626838684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3439979553222656, + "epoch": 5.13, + "learning_rate": 2.704987320371936e-05, + "loss": 1.5178, + "step": 6070, + "task_loss": 1.4543005228042603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8243721723556519, + "epoch": 5.13, + "learning_rate": 2.7045177045177045e-05, + "loss": 1.6764, + "step": 6071, + "task_loss": 2.1928622722625732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2107899188995361, + "epoch": 5.13, + "learning_rate": 2.7040480886634734e-05, + "loss": 1.5046, + "step": 6072, + "task_loss": 1.5262925624847412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7872982025146484, + "epoch": 5.13, + "learning_rate": 2.7035784728092424e-05, + "loss": 1.1615, + "step": 6073, + "task_loss": 0.08315528929233551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0898466110229492, + "epoch": 5.13, + "learning_rate": 2.703108856955011e-05, + "loss": 1.4369, + "step": 6074, + "task_loss": 0.39144861698150635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5850605964660645, + "epoch": 5.14, + "learning_rate": 2.7026392411007793e-05, + "loss": 1.4368, + "step": 6075, + "task_loss": 1.384130835533142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.100174903869629, + "epoch": 5.14, + "learning_rate": 2.7021696252465483e-05, + "loss": 1.1585, + "step": 6076, + "task_loss": 1.4277344942092896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9928566217422485, + "epoch": 5.14, + "learning_rate": 2.7017000093923173e-05, + "loss": 1.4928, + "step": 6077, + "task_loss": 2.2164745330810547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.842015027999878, + "epoch": 5.14, + "learning_rate": 2.7012303935380863e-05, + "loss": 1.7829, + "step": 6078, + "task_loss": 1.5232429504394531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9228453636169434, + "epoch": 5.14, + "learning_rate": 2.7007607776838545e-05, + "loss": 1.0075, + "step": 6079, + "task_loss": 0.9605417251586914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9140515923500061, + "epoch": 5.14, + "learning_rate": 2.7002911618296235e-05, + "loss": 1.1614, + "step": 6080, + "task_loss": 1.0511114597320557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.028151512145996, + "epoch": 5.14, + "learning_rate": 2.699821545975392e-05, + "loss": 1.5562, + "step": 6081, + "task_loss": 0.42041832208633423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8639107942581177, + "epoch": 5.14, + "learning_rate": 2.699351930121161e-05, + "loss": 1.2336, + "step": 6082, + "task_loss": 0.6014939546585083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9981849789619446, + "epoch": 5.14, + "learning_rate": 2.69888231426693e-05, + "loss": 1.1848, + "step": 6083, + "task_loss": 0.7643138766288757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9919228553771973, + "epoch": 5.14, + "learning_rate": 2.6984126984126984e-05, + "loss": 1.1664, + "step": 6084, + "task_loss": 0.6952190399169922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5592262744903564, + "epoch": 5.14, + "learning_rate": 2.6979430825584674e-05, + "loss": 1.4078, + "step": 6085, + "task_loss": 1.4757106304168701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7650840282440186, + "epoch": 5.14, + "learning_rate": 2.6974734667042363e-05, + "loss": 1.6725, + "step": 6086, + "task_loss": 0.8313434720039368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9213629961013794, + "epoch": 5.15, + "learning_rate": 2.697003850850005e-05, + "loss": 1.5162, + "step": 6087, + "task_loss": 0.77763432264328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8686922788619995, + "epoch": 5.15, + "learning_rate": 2.6965342349957733e-05, + "loss": 1.5868, + "step": 6088, + "task_loss": 1.0385090112686157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2186870574951172, + "epoch": 5.15, + "learning_rate": 2.6960646191415422e-05, + "loss": 1.0529, + "step": 6089, + "task_loss": 0.7597925066947937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.56624436378479, + "epoch": 5.15, + "learning_rate": 2.6955950032873112e-05, + "loss": 1.4093, + "step": 6090, + "task_loss": 1.1457046270370483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.48423171043396, + "epoch": 5.15, + "learning_rate": 2.6951253874330802e-05, + "loss": 1.1017, + "step": 6091, + "task_loss": 1.1203498840332031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7660644054412842, + "epoch": 5.15, + "learning_rate": 2.6946557715788485e-05, + "loss": 1.2766, + "step": 6092, + "task_loss": 1.1163121461868286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3043910264968872, + "epoch": 5.15, + "learning_rate": 2.6941861557246175e-05, + "loss": 1.2075, + "step": 6093, + "task_loss": 1.3990834951400757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6476186513900757, + "epoch": 5.15, + "learning_rate": 2.693716539870386e-05, + "loss": 1.3961, + "step": 6094, + "task_loss": 1.104543924331665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.882048487663269, + "epoch": 5.15, + "learning_rate": 2.693246924016155e-05, + "loss": 1.1462, + "step": 6095, + "task_loss": 1.5248403549194336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2639052867889404, + "epoch": 5.15, + "learning_rate": 2.6927773081619234e-05, + "loss": 1.5688, + "step": 6096, + "task_loss": 1.4257371425628662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3533936738967896, + "epoch": 5.15, + "learning_rate": 2.6923076923076923e-05, + "loss": 1.2572, + "step": 6097, + "task_loss": 1.4637911319732666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8188688158988953, + "epoch": 5.15, + "learning_rate": 2.6918380764534613e-05, + "loss": 1.1332, + "step": 6098, + "task_loss": 1.5603359937667847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.67162024974823, + "epoch": 5.16, + "learning_rate": 2.69136846059923e-05, + "loss": 1.251, + "step": 6099, + "task_loss": 1.4481875896453857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2423639297485352, + "epoch": 5.16, + "learning_rate": 2.690898844744999e-05, + "loss": 1.3194, + "step": 6100, + "task_loss": 0.5610933899879456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1068581342697144, + "epoch": 5.16, + "learning_rate": 2.6904292288907672e-05, + "loss": 1.1608, + "step": 6101, + "task_loss": 3.1303048133850098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.431076169013977, + "epoch": 5.16, + "learning_rate": 2.6899596130365362e-05, + "loss": 1.128, + "step": 6102, + "task_loss": 0.6214473247528076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.376178503036499, + "epoch": 5.16, + "learning_rate": 2.689489997182305e-05, + "loss": 1.2766, + "step": 6103, + "task_loss": 0.6475787162780762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2743505239486694, + "epoch": 5.16, + "learning_rate": 2.689020381328074e-05, + "loss": 1.1901, + "step": 6104, + "task_loss": 0.7211982607841492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5053305625915527, + "epoch": 5.16, + "learning_rate": 2.6885507654738424e-05, + "loss": 1.4493, + "step": 6105, + "task_loss": 0.6894542574882507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5276148319244385, + "epoch": 5.16, + "learning_rate": 2.688081149619611e-05, + "loss": 1.1566, + "step": 6106, + "task_loss": 1.2581615447998047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0961766242980957, + "epoch": 5.16, + "learning_rate": 2.68761153376538e-05, + "loss": 1.5164, + "step": 6107, + "task_loss": 0.9878286123275757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5335888862609863, + "epoch": 5.16, + "learning_rate": 2.687141917911149e-05, + "loss": 0.9308, + "step": 6108, + "task_loss": 0.4749242663383484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2188602685928345, + "epoch": 5.16, + "learning_rate": 2.6866723020569173e-05, + "loss": 1.1145, + "step": 6109, + "task_loss": 1.1711950302124023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4144906997680664, + "epoch": 5.16, + "learning_rate": 2.6862026862026863e-05, + "loss": 1.5315, + "step": 6110, + "task_loss": 1.0388909578323364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6925595998764038, + "epoch": 5.17, + "learning_rate": 2.6857330703484552e-05, + "loss": 1.5303, + "step": 6111, + "task_loss": 1.4400750398635864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8189839124679565, + "epoch": 5.17, + "learning_rate": 2.685263454494224e-05, + "loss": 1.1633, + "step": 6112, + "task_loss": 1.469436764717102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7965493202209473, + "epoch": 5.17, + "learning_rate": 2.6847938386399925e-05, + "loss": 1.2321, + "step": 6113, + "task_loss": 1.2704484462738037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5181885957717896, + "epoch": 5.17, + "learning_rate": 2.684324222785761e-05, + "loss": 1.3499, + "step": 6114, + "task_loss": 1.2870570421218872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4461888074874878, + "epoch": 5.17, + "learning_rate": 2.68385460693153e-05, + "loss": 1.3922, + "step": 6115, + "task_loss": 1.0139142274856567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6951039433479309, + "epoch": 5.17, + "learning_rate": 2.683384991077299e-05, + "loss": 1.1581, + "step": 6116, + "task_loss": 0.37230202555656433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3888874053955078, + "epoch": 5.17, + "learning_rate": 2.682915375223068e-05, + "loss": 1.2628, + "step": 6117, + "task_loss": 0.3806201219558716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.343388319015503, + "epoch": 5.17, + "learning_rate": 2.6824457593688364e-05, + "loss": 1.3597, + "step": 6118, + "task_loss": 1.174654483795166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.825045108795166, + "epoch": 5.17, + "learning_rate": 2.681976143514605e-05, + "loss": 1.1432, + "step": 6119, + "task_loss": 1.1517502069473267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0110048055648804, + "epoch": 5.17, + "learning_rate": 2.681506527660374e-05, + "loss": 1.1355, + "step": 6120, + "task_loss": 0.5486020445823669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7390542030334473, + "epoch": 5.17, + "learning_rate": 2.681036911806143e-05, + "loss": 1.4507, + "step": 6121, + "task_loss": 2.345353364944458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8099324107170105, + "epoch": 5.17, + "learning_rate": 2.6805672959519112e-05, + "loss": 1.1678, + "step": 6122, + "task_loss": 0.7387697100639343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5662109851837158, + "epoch": 5.18, + "learning_rate": 2.6800976800976802e-05, + "loss": 1.416, + "step": 6123, + "task_loss": 1.1730748414993286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.937831461429596, + "epoch": 5.18, + "learning_rate": 2.6796280642434492e-05, + "loss": 1.161, + "step": 6124, + "task_loss": 1.0569132566452026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4459607601165771, + "epoch": 5.18, + "learning_rate": 2.6791584483892178e-05, + "loss": 1.2346, + "step": 6125, + "task_loss": 1.0594303607940674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2699089050292969, + "epoch": 5.18, + "learning_rate": 2.678688832534986e-05, + "loss": 1.0665, + "step": 6126, + "task_loss": 0.6816018223762512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1862905025482178, + "epoch": 5.18, + "learning_rate": 2.678219216680755e-05, + "loss": 1.2084, + "step": 6127, + "task_loss": 0.269861102104187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.633484959602356, + "epoch": 5.18, + "learning_rate": 2.677749600826524e-05, + "loss": 1.9365, + "step": 6128, + "task_loss": 0.8539460897445679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5797567367553711, + "epoch": 5.18, + "learning_rate": 2.677279984972293e-05, + "loss": 1.0908, + "step": 6129, + "task_loss": 0.884101390838623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1826082468032837, + "epoch": 5.18, + "learning_rate": 2.6768103691180617e-05, + "loss": 1.1799, + "step": 6130, + "task_loss": 1.1301647424697876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1413825750350952, + "epoch": 5.18, + "learning_rate": 2.6763407532638303e-05, + "loss": 1.2428, + "step": 6131, + "task_loss": 1.1832886934280396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9571319818496704, + "epoch": 5.18, + "learning_rate": 2.675871137409599e-05, + "loss": 1.109, + "step": 6132, + "task_loss": 1.0448585748672485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1077163219451904, + "epoch": 5.18, + "learning_rate": 2.675401521555368e-05, + "loss": 1.0242, + "step": 6133, + "task_loss": 0.5394262671470642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4249165058135986, + "epoch": 5.19, + "learning_rate": 2.674931905701137e-05, + "loss": 1.4015, + "step": 6134, + "task_loss": 2.117480754852295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.258486032485962, + "epoch": 5.19, + "learning_rate": 2.674462289846905e-05, + "loss": 1.1072, + "step": 6135, + "task_loss": 0.5978756546974182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7848345041275024, + "epoch": 5.19, + "learning_rate": 2.673992673992674e-05, + "loss": 0.7429, + "step": 6136, + "task_loss": 0.8130994439125061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.338639497756958, + "epoch": 5.19, + "learning_rate": 2.673523058138443e-05, + "loss": 1.0678, + "step": 6137, + "task_loss": 1.8277900218963623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6257516145706177, + "epoch": 5.19, + "learning_rate": 2.6730534422842117e-05, + "loss": 1.0829, + "step": 6138, + "task_loss": 1.0870119333267212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.579148292541504, + "epoch": 5.19, + "learning_rate": 2.67258382642998e-05, + "loss": 1.3776, + "step": 6139, + "task_loss": 0.5376133918762207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2613327503204346, + "epoch": 5.19, + "learning_rate": 2.672114210575749e-05, + "loss": 1.1508, + "step": 6140, + "task_loss": 0.6467357873916626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.401784896850586, + "epoch": 5.19, + "learning_rate": 2.671644594721518e-05, + "loss": 1.3999, + "step": 6141, + "task_loss": 1.8623753786087036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8652021288871765, + "epoch": 5.19, + "learning_rate": 2.671174978867287e-05, + "loss": 1.1823, + "step": 6142, + "task_loss": 0.4130382239818573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7996633052825928, + "epoch": 5.19, + "learning_rate": 2.6707053630130553e-05, + "loss": 1.1557, + "step": 6143, + "task_loss": 0.7167235016822815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8029072284698486, + "epoch": 5.19, + "learning_rate": 2.6702357471588242e-05, + "loss": 1.1946, + "step": 6144, + "task_loss": 1.0476367473602295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1070213317871094, + "epoch": 5.19, + "learning_rate": 2.669766131304593e-05, + "loss": 1.1934, + "step": 6145, + "task_loss": 0.9667125940322876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5162668228149414, + "epoch": 5.2, + "learning_rate": 2.669296515450362e-05, + "loss": 1.284, + "step": 6146, + "task_loss": 1.616597056388855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.201389193534851, + "epoch": 5.2, + "learning_rate": 2.6688268995961308e-05, + "loss": 1.0976, + "step": 6147, + "task_loss": 0.787985622882843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7497252225875854, + "epoch": 5.2, + "learning_rate": 2.668357283741899e-05, + "loss": 0.9701, + "step": 6148, + "task_loss": 1.0974105596542358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.155273675918579, + "epoch": 5.2, + "learning_rate": 2.667887667887668e-05, + "loss": 0.9958, + "step": 6149, + "task_loss": 0.6040500998497009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.132684588432312, + "epoch": 5.2, + "learning_rate": 2.6674180520334367e-05, + "loss": 1.0203, + "step": 6150, + "task_loss": 0.9335824251174927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6399801969528198, + "epoch": 5.2, + "learning_rate": 2.6669484361792057e-05, + "loss": 0.9643, + "step": 6151, + "task_loss": 0.6210339069366455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6683968305587769, + "epoch": 5.2, + "learning_rate": 2.666478820324974e-05, + "loss": 1.4203, + "step": 6152, + "task_loss": 0.7052663564682007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6595520973205566, + "epoch": 5.2, + "learning_rate": 2.666009204470743e-05, + "loss": 1.2996, + "step": 6153, + "task_loss": 2.007265329360962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6942312717437744, + "epoch": 5.2, + "learning_rate": 2.665539588616512e-05, + "loss": 1.3325, + "step": 6154, + "task_loss": 1.0949172973632812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6002906560897827, + "epoch": 5.2, + "learning_rate": 2.665069972762281e-05, + "loss": 1.0721, + "step": 6155, + "task_loss": 1.6846591234207153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3470604419708252, + "epoch": 5.2, + "learning_rate": 2.6646003569080492e-05, + "loss": 1.263, + "step": 6156, + "task_loss": 1.3509641885757446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2867796421051025, + "epoch": 5.2, + "learning_rate": 2.6641307410538178e-05, + "loss": 1.2219, + "step": 6157, + "task_loss": 1.1966526508331299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2978180646896362, + "epoch": 5.21, + "learning_rate": 2.6636611251995868e-05, + "loss": 1.2496, + "step": 6158, + "task_loss": 1.253466248512268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1556897163391113, + "epoch": 5.21, + "learning_rate": 2.6631915093453558e-05, + "loss": 1.2869, + "step": 6159, + "task_loss": 1.6497784852981567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9617151021957397, + "epoch": 5.21, + "learning_rate": 2.6627218934911247e-05, + "loss": 1.0671, + "step": 6160, + "task_loss": 1.0358563661575317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1619236469268799, + "epoch": 5.21, + "learning_rate": 2.662252277636893e-05, + "loss": 1.4642, + "step": 6161, + "task_loss": 0.667813777923584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.603804349899292, + "epoch": 5.21, + "learning_rate": 2.661782661782662e-05, + "loss": 1.2215, + "step": 6162, + "task_loss": 2.0797836780548096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4060077667236328, + "epoch": 5.21, + "learning_rate": 2.6613130459284306e-05, + "loss": 1.4375, + "step": 6163, + "task_loss": 1.161781668663025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.009763240814209, + "epoch": 5.21, + "learning_rate": 2.6608434300741996e-05, + "loss": 0.9579, + "step": 6164, + "task_loss": 0.9952691793441772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0181652307510376, + "epoch": 5.21, + "learning_rate": 2.660373814219968e-05, + "loss": 1.0573, + "step": 6165, + "task_loss": 1.2562041282653809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1134276390075684, + "epoch": 5.21, + "learning_rate": 2.659904198365737e-05, + "loss": 0.9777, + "step": 6166, + "task_loss": 1.0121058225631714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0962872505187988, + "epoch": 5.21, + "learning_rate": 2.659434582511506e-05, + "loss": 1.3992, + "step": 6167, + "task_loss": 0.6822119951248169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.163713812828064, + "epoch": 5.21, + "learning_rate": 2.658964966657275e-05, + "loss": 1.0107, + "step": 6168, + "task_loss": 1.0314366817474365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7791461944580078, + "epoch": 5.21, + "learning_rate": 2.658495350803043e-05, + "loss": 1.2019, + "step": 6169, + "task_loss": 0.672261118888855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7127944827079773, + "epoch": 5.22, + "learning_rate": 2.6580257349488118e-05, + "loss": 0.9899, + "step": 6170, + "task_loss": 0.8570666909217834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4738926887512207, + "epoch": 5.22, + "learning_rate": 2.6575561190945807e-05, + "loss": 1.2977, + "step": 6171, + "task_loss": 1.523746132850647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8724461793899536, + "epoch": 5.22, + "learning_rate": 2.6570865032403497e-05, + "loss": 1.1183, + "step": 6172, + "task_loss": 0.9286195635795593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0347788333892822, + "epoch": 5.22, + "learning_rate": 2.656616887386118e-05, + "loss": 1.3287, + "step": 6173, + "task_loss": 0.5948042273521423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8637802600860596, + "epoch": 5.22, + "learning_rate": 2.656147271531887e-05, + "loss": 1.2724, + "step": 6174, + "task_loss": 1.007073998451233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1761953830718994, + "epoch": 5.22, + "learning_rate": 2.655677655677656e-05, + "loss": 1.0793, + "step": 6175, + "task_loss": 0.9497080445289612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0064382553100586, + "epoch": 5.22, + "learning_rate": 2.6552080398234246e-05, + "loss": 1.1906, + "step": 6176, + "task_loss": 0.7407934069633484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9420294761657715, + "epoch": 5.22, + "learning_rate": 2.6547384239691936e-05, + "loss": 1.1066, + "step": 6177, + "task_loss": 0.9494105577468872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.763310432434082, + "epoch": 5.22, + "learning_rate": 2.654268808114962e-05, + "loss": 0.7693, + "step": 6178, + "task_loss": 1.0830469131469727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5182173252105713, + "epoch": 5.22, + "learning_rate": 2.6537991922607308e-05, + "loss": 1.2633, + "step": 6179, + "task_loss": 1.0931538343429565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8743488192558289, + "epoch": 5.22, + "learning_rate": 2.6533295764064998e-05, + "loss": 1.0093, + "step": 6180, + "task_loss": 0.5811735391616821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0481631755828857, + "epoch": 5.22, + "learning_rate": 2.6528599605522688e-05, + "loss": 1.3282, + "step": 6181, + "task_loss": 0.9845531582832336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8869565725326538, + "epoch": 5.23, + "learning_rate": 2.652390344698037e-05, + "loss": 1.126, + "step": 6182, + "task_loss": 1.1474261283874512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.761346459388733, + "epoch": 5.23, + "learning_rate": 2.6519207288438057e-05, + "loss": 1.2608, + "step": 6183, + "task_loss": 2.7264761924743652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5554115772247314, + "epoch": 5.23, + "learning_rate": 2.6514511129895747e-05, + "loss": 1.132, + "step": 6184, + "task_loss": 0.7846354246139526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6136316061019897, + "epoch": 5.23, + "learning_rate": 2.6509814971353436e-05, + "loss": 1.3493, + "step": 6185, + "task_loss": 1.4481470584869385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.132309913635254, + "epoch": 5.23, + "learning_rate": 2.650511881281112e-05, + "loss": 1.1932, + "step": 6186, + "task_loss": 0.976943850517273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6539921760559082, + "epoch": 5.23, + "learning_rate": 2.650042265426881e-05, + "loss": 1.6107, + "step": 6187, + "task_loss": 1.3986767530441284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7405858039855957, + "epoch": 5.23, + "learning_rate": 2.64957264957265e-05, + "loss": 1.3736, + "step": 6188, + "task_loss": 1.4649220705032349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0874395370483398, + "epoch": 5.23, + "learning_rate": 2.6491030337184185e-05, + "loss": 1.2465, + "step": 6189, + "task_loss": 1.2736620903015137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.33677077293396, + "epoch": 5.23, + "learning_rate": 2.6486334178641868e-05, + "loss": 0.9569, + "step": 6190, + "task_loss": 0.3898025453090668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1996185779571533, + "epoch": 5.23, + "learning_rate": 2.6481638020099558e-05, + "loss": 1.1106, + "step": 6191, + "task_loss": 1.4132800102233887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5369480848312378, + "epoch": 5.23, + "learning_rate": 2.6476941861557248e-05, + "loss": 1.4528, + "step": 6192, + "task_loss": 1.032072901725769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3576630353927612, + "epoch": 5.23, + "learning_rate": 2.6472245703014937e-05, + "loss": 1.1274, + "step": 6193, + "task_loss": 1.2482482194900513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.205902099609375, + "epoch": 5.24, + "learning_rate": 2.6467549544472624e-05, + "loss": 1.6272, + "step": 6194, + "task_loss": 0.7223795056343079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6535296440124512, + "epoch": 5.24, + "learning_rate": 2.646285338593031e-05, + "loss": 1.3122, + "step": 6195, + "task_loss": 1.471187949180603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2265735864639282, + "epoch": 5.24, + "learning_rate": 2.6458157227387996e-05, + "loss": 0.9638, + "step": 6196, + "task_loss": 1.6015468835830688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.218266248703003, + "epoch": 5.24, + "learning_rate": 2.6453461068845686e-05, + "loss": 1.1289, + "step": 6197, + "task_loss": 0.9043564796447754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.0610432624816895, + "epoch": 5.24, + "learning_rate": 2.6448764910303376e-05, + "loss": 1.2271, + "step": 6198, + "task_loss": 0.833638072013855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8974314332008362, + "epoch": 5.24, + "learning_rate": 2.644406875176106e-05, + "loss": 1.0875, + "step": 6199, + "task_loss": 0.6530605554580688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.851225733757019, + "epoch": 5.24, + "learning_rate": 2.643937259321875e-05, + "loss": 1.0794, + "step": 6200, + "task_loss": 0.8912054300308228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.969151496887207, + "epoch": 5.24, + "learning_rate": 2.6434676434676435e-05, + "loss": 1.4437, + "step": 6201, + "task_loss": 0.7171374559402466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1624693870544434, + "epoch": 5.24, + "learning_rate": 2.6429980276134125e-05, + "loss": 1.0727, + "step": 6202, + "task_loss": 1.8397388458251953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9696145057678223, + "epoch": 5.24, + "learning_rate": 2.6425284117591807e-05, + "loss": 1.1474, + "step": 6203, + "task_loss": 0.941592276096344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6954988837242126, + "epoch": 5.24, + "learning_rate": 2.6420587959049497e-05, + "loss": 1.0299, + "step": 6204, + "task_loss": 0.6937874555587769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4483092427253723, + "epoch": 5.24, + "learning_rate": 2.6415891800507187e-05, + "loss": 0.9922, + "step": 6205, + "task_loss": 0.3380734622478485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5732932090759277, + "epoch": 5.25, + "learning_rate": 2.6411195641964877e-05, + "loss": 1.312, + "step": 6206, + "task_loss": 1.6250627040863037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2279834747314453, + "epoch": 5.25, + "learning_rate": 2.6406499483422563e-05, + "loss": 1.3742, + "step": 6207, + "task_loss": 1.3332117795944214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8850380182266235, + "epoch": 5.25, + "learning_rate": 2.640180332488025e-05, + "loss": 1.0146, + "step": 6208, + "task_loss": 0.8966948986053467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8743408918380737, + "epoch": 5.25, + "learning_rate": 2.6397107166337936e-05, + "loss": 1.2203, + "step": 6209, + "task_loss": 0.6471081376075745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.928680956363678, + "epoch": 5.25, + "learning_rate": 2.6392411007795625e-05, + "loss": 0.9443, + "step": 6210, + "task_loss": 1.6760292053222656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.535560131072998, + "epoch": 5.25, + "learning_rate": 2.6387714849253315e-05, + "loss": 1.5162, + "step": 6211, + "task_loss": 0.9994226098060608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 3.1606507301330566, + "epoch": 5.25, + "learning_rate": 2.6383018690710998e-05, + "loss": 1.6196, + "step": 6212, + "task_loss": 1.8813225030899048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7509488463401794, + "epoch": 5.25, + "learning_rate": 2.6378322532168688e-05, + "loss": 1.1811, + "step": 6213, + "task_loss": 0.18218758702278137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.429240107536316, + "epoch": 5.25, + "learning_rate": 2.6373626373626374e-05, + "loss": 1.0223, + "step": 6214, + "task_loss": 1.5722639560699463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2544280290603638, + "epoch": 5.25, + "learning_rate": 2.6368930215084064e-05, + "loss": 1.469, + "step": 6215, + "task_loss": 2.005526542663574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5714510679244995, + "epoch": 5.25, + "learning_rate": 2.6364234056541747e-05, + "loss": 1.2638, + "step": 6216, + "task_loss": 1.1345816850662231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0270071029663086, + "epoch": 5.26, + "learning_rate": 2.6359537897999437e-05, + "loss": 1.1066, + "step": 6217, + "task_loss": 0.5646890997886658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7078343033790588, + "epoch": 5.26, + "learning_rate": 2.6354841739457126e-05, + "loss": 1.1184, + "step": 6218, + "task_loss": 0.40160274505615234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6631669998168945, + "epoch": 5.26, + "learning_rate": 2.6350145580914816e-05, + "loss": 1.0835, + "step": 6219, + "task_loss": 0.4133947193622589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4783446788787842, + "epoch": 5.26, + "learning_rate": 2.63454494223725e-05, + "loss": 1.3532, + "step": 6220, + "task_loss": 1.4772411584854126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4749538898468018, + "epoch": 5.26, + "learning_rate": 2.6340753263830185e-05, + "loss": 1.2705, + "step": 6221, + "task_loss": 0.6059532165527344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7464972734451294, + "epoch": 5.26, + "learning_rate": 2.6336057105287875e-05, + "loss": 1.2724, + "step": 6222, + "task_loss": 0.8676250576972961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0208845138549805, + "epoch": 5.26, + "learning_rate": 2.6331360946745565e-05, + "loss": 0.9391, + "step": 6223, + "task_loss": 0.611379086971283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2742042541503906, + "epoch": 5.26, + "learning_rate": 2.6326664788203254e-05, + "loss": 1.3407, + "step": 6224, + "task_loss": 0.6895625591278076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5953598022460938, + "epoch": 5.26, + "learning_rate": 2.6321968629660937e-05, + "loss": 1.2846, + "step": 6225, + "task_loss": 1.449446201324463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0208227634429932, + "epoch": 5.26, + "learning_rate": 2.6317272471118627e-05, + "loss": 1.034, + "step": 6226, + "task_loss": 0.30685529112815857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5127379894256592, + "epoch": 5.26, + "learning_rate": 2.6312576312576314e-05, + "loss": 1.3577, + "step": 6227, + "task_loss": 0.4128589928150177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7353520393371582, + "epoch": 5.26, + "learning_rate": 2.6307880154034003e-05, + "loss": 1.2173, + "step": 6228, + "task_loss": 2.3256077766418457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.039460301399231, + "epoch": 5.27, + "learning_rate": 2.6303183995491686e-05, + "loss": 1.1937, + "step": 6229, + "task_loss": 0.7884426712989807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.445837140083313, + "epoch": 5.27, + "learning_rate": 2.6298487836949376e-05, + "loss": 1.6233, + "step": 6230, + "task_loss": 1.121649146080017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1477904319763184, + "epoch": 5.27, + "learning_rate": 2.6293791678407066e-05, + "loss": 1.0992, + "step": 6231, + "task_loss": 0.87971031665802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6022188067436218, + "epoch": 5.27, + "learning_rate": 2.6289095519864755e-05, + "loss": 1.1678, + "step": 6232, + "task_loss": 0.31204754114151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.944837212562561, + "epoch": 5.27, + "learning_rate": 2.628439936132244e-05, + "loss": 1.1334, + "step": 6233, + "task_loss": 1.4712275266647339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1013222932815552, + "epoch": 5.27, + "learning_rate": 2.6279703202780125e-05, + "loss": 1.2401, + "step": 6234, + "task_loss": 1.4451813697814941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.082606554031372, + "epoch": 5.27, + "learning_rate": 2.6275007044237814e-05, + "loss": 1.2169, + "step": 6235, + "task_loss": 1.1276285648345947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7242954969406128, + "epoch": 5.27, + "learning_rate": 2.6270310885695504e-05, + "loss": 1.3732, + "step": 6236, + "task_loss": 1.0223360061645508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.33125901222229, + "epoch": 5.27, + "learning_rate": 2.6265614727153194e-05, + "loss": 1.2937, + "step": 6237, + "task_loss": 0.883307158946991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3754355907440186, + "epoch": 5.27, + "learning_rate": 2.6260918568610877e-05, + "loss": 1.1066, + "step": 6238, + "task_loss": 1.233985185623169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0425667762756348, + "epoch": 5.27, + "learning_rate": 2.6256222410068567e-05, + "loss": 0.8587, + "step": 6239, + "task_loss": 0.6798689961433411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1578224897384644, + "epoch": 5.27, + "learning_rate": 2.6251526251526253e-05, + "loss": 1.0135, + "step": 6240, + "task_loss": 1.5707179307937622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7140103578567505, + "epoch": 5.28, + "learning_rate": 2.6246830092983943e-05, + "loss": 1.0976, + "step": 6241, + "task_loss": 2.9521336555480957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9706842303276062, + "epoch": 5.28, + "learning_rate": 2.6242133934441626e-05, + "loss": 0.9612, + "step": 6242, + "task_loss": 1.531694769859314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0437289476394653, + "epoch": 5.28, + "learning_rate": 2.6237437775899315e-05, + "loss": 1.0858, + "step": 6243, + "task_loss": 1.9825717210769653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7636774778366089, + "epoch": 5.28, + "learning_rate": 2.6232741617357005e-05, + "loss": 1.2448, + "step": 6244, + "task_loss": 0.8850448727607727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.923209011554718, + "epoch": 5.28, + "learning_rate": 2.622804545881469e-05, + "loss": 1.2228, + "step": 6245, + "task_loss": 1.6893441677093506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2808351516723633, + "epoch": 5.28, + "learning_rate": 2.6223349300272378e-05, + "loss": 1.2283, + "step": 6246, + "task_loss": 1.5021425485610962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0824205875396729, + "epoch": 5.28, + "learning_rate": 2.6218653141730064e-05, + "loss": 0.833, + "step": 6247, + "task_loss": 0.9099451899528503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6898624897003174, + "epoch": 5.28, + "learning_rate": 2.6213956983187754e-05, + "loss": 1.0686, + "step": 6248, + "task_loss": 0.8602155447006226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0620982646942139, + "epoch": 5.28, + "learning_rate": 2.6209260824645443e-05, + "loss": 0.9658, + "step": 6249, + "task_loss": 1.9382007122039795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.317171573638916, + "epoch": 5.28, + "learning_rate": 2.6204564666103126e-05, + "loss": 1.0456, + "step": 6250, + "task_loss": 1.481420874595642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8809815049171448, + "epoch": 5.28, + "learning_rate": 2.6199868507560816e-05, + "loss": 1.1278, + "step": 6251, + "task_loss": 1.2659659385681152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9375021457672119, + "epoch": 5.28, + "learning_rate": 2.6195172349018502e-05, + "loss": 1.2362, + "step": 6252, + "task_loss": 1.1262375116348267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7712739109992981, + "epoch": 5.29, + "learning_rate": 2.6190476190476192e-05, + "loss": 0.8891, + "step": 6253, + "task_loss": 0.4744390845298767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9102873802185059, + "epoch": 5.29, + "learning_rate": 2.6185780031933882e-05, + "loss": 1.0323, + "step": 6254, + "task_loss": 0.2178632766008377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.329350233078003, + "epoch": 5.29, + "learning_rate": 2.6181083873391565e-05, + "loss": 1.1285, + "step": 6255, + "task_loss": 1.2384041547775269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0683484077453613, + "epoch": 5.29, + "learning_rate": 2.6176387714849255e-05, + "loss": 0.9984, + "step": 6256, + "task_loss": 1.2279257774353027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.657471239566803, + "epoch": 5.29, + "learning_rate": 2.6171691556306944e-05, + "loss": 0.9212, + "step": 6257, + "task_loss": 0.5140358805656433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.202399730682373, + "epoch": 5.29, + "learning_rate": 2.616699539776463e-05, + "loss": 1.3406, + "step": 6258, + "task_loss": 1.6157007217407227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7269793152809143, + "epoch": 5.29, + "learning_rate": 2.6162299239222317e-05, + "loss": 1.1243, + "step": 6259, + "task_loss": 0.5015912652015686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7032637596130371, + "epoch": 5.29, + "learning_rate": 2.6157603080680003e-05, + "loss": 1.1415, + "step": 6260, + "task_loss": 1.0667850971221924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8244988918304443, + "epoch": 5.29, + "learning_rate": 2.6152906922137693e-05, + "loss": 1.1015, + "step": 6261, + "task_loss": 1.6181848049163818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7789819240570068, + "epoch": 5.29, + "learning_rate": 2.6148210763595383e-05, + "loss": 1.1612, + "step": 6262, + "task_loss": 1.3308385610580444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1467335224151611, + "epoch": 5.29, + "learning_rate": 2.6143514605053066e-05, + "loss": 1.115, + "step": 6263, + "task_loss": 0.5159995555877686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1943199634552002, + "epoch": 5.29, + "learning_rate": 2.6138818446510756e-05, + "loss": 1.1475, + "step": 6264, + "task_loss": 1.0690507888793945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1759699583053589, + "epoch": 5.3, + "learning_rate": 2.6134122287968442e-05, + "loss": 1.2338, + "step": 6265, + "task_loss": 0.7229683995246887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8486177921295166, + "epoch": 5.3, + "learning_rate": 2.612942612942613e-05, + "loss": 0.9601, + "step": 6266, + "task_loss": 0.8885012269020081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8676465749740601, + "epoch": 5.3, + "learning_rate": 2.6124729970883815e-05, + "loss": 1.0845, + "step": 6267, + "task_loss": 0.6014307141304016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3205220699310303, + "epoch": 5.3, + "learning_rate": 2.6120033812341504e-05, + "loss": 1.1579, + "step": 6268, + "task_loss": 1.1648128032684326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.163299322128296, + "epoch": 5.3, + "learning_rate": 2.6115337653799194e-05, + "loss": 1.0619, + "step": 6269, + "task_loss": 1.0477020740509033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2350797653198242, + "epoch": 5.3, + "learning_rate": 2.6110641495256884e-05, + "loss": 1.1997, + "step": 6270, + "task_loss": 0.47865501046180725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.043689489364624, + "epoch": 5.3, + "learning_rate": 2.610594533671457e-05, + "loss": 1.2971, + "step": 6271, + "task_loss": 1.0475184917449951 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2485613822937012, + "epoch": 5.3, + "learning_rate": 2.6101249178172253e-05, + "loss": 1.2204, + "step": 6272, + "task_loss": 1.0651555061340332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8915637731552124, + "epoch": 5.3, + "learning_rate": 2.6096553019629943e-05, + "loss": 1.065, + "step": 6273, + "task_loss": 0.740326464176178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7085151672363281, + "epoch": 5.3, + "learning_rate": 2.6091856861087632e-05, + "loss": 0.7562, + "step": 6274, + "task_loss": 0.9056514501571655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2573350667953491, + "epoch": 5.3, + "learning_rate": 2.6087160702545322e-05, + "loss": 1.1908, + "step": 6275, + "task_loss": 0.3315636217594147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.064650535583496, + "epoch": 5.3, + "learning_rate": 2.6082464544003005e-05, + "loss": 0.9445, + "step": 6276, + "task_loss": 0.601737380027771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6807420253753662, + "epoch": 5.31, + "learning_rate": 2.6077768385460695e-05, + "loss": 1.2413, + "step": 6277, + "task_loss": 1.6614559888839722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8984200358390808, + "epoch": 5.31, + "learning_rate": 2.607307222691838e-05, + "loss": 1.0405, + "step": 6278, + "task_loss": 0.4443584382534027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7277846932411194, + "epoch": 5.31, + "learning_rate": 2.606837606837607e-05, + "loss": 0.8638, + "step": 6279, + "task_loss": 0.7855465412139893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2097464799880981, + "epoch": 5.31, + "learning_rate": 2.6063679909833754e-05, + "loss": 1.0682, + "step": 6280, + "task_loss": 1.6008341312408447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4900636672973633, + "epoch": 5.31, + "learning_rate": 2.6058983751291444e-05, + "loss": 1.0517, + "step": 6281, + "task_loss": 1.4388266801834106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4992321729660034, + "epoch": 5.31, + "learning_rate": 2.6054287592749133e-05, + "loss": 1.1086, + "step": 6282, + "task_loss": 0.4883953630924225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7810841798782349, + "epoch": 5.31, + "learning_rate": 2.6049591434206823e-05, + "loss": 0.8893, + "step": 6283, + "task_loss": 0.8573915958404541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.777472972869873, + "epoch": 5.31, + "learning_rate": 2.604489527566451e-05, + "loss": 1.0008, + "step": 6284, + "task_loss": 1.1877861022949219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9558157920837402, + "epoch": 5.31, + "learning_rate": 2.6040199117122192e-05, + "loss": 1.2275, + "step": 6285, + "task_loss": 2.007687568664551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8540509939193726, + "epoch": 5.31, + "learning_rate": 2.6035502958579882e-05, + "loss": 1.1215, + "step": 6286, + "task_loss": 0.7785129547119141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2505308389663696, + "epoch": 5.31, + "learning_rate": 2.6030806800037572e-05, + "loss": 1.0324, + "step": 6287, + "task_loss": 0.9966123104095459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4795042276382446, + "epoch": 5.32, + "learning_rate": 2.602611064149526e-05, + "loss": 1.2354, + "step": 6288, + "task_loss": 1.2905465364456177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5502089262008667, + "epoch": 5.32, + "learning_rate": 2.6021414482952944e-05, + "loss": 0.8555, + "step": 6289, + "task_loss": 0.7582305669784546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.451699137687683, + "epoch": 5.32, + "learning_rate": 2.6016718324410634e-05, + "loss": 1.2322, + "step": 6290, + "task_loss": 1.3822592496871948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.121490478515625, + "epoch": 5.32, + "learning_rate": 2.601202216586832e-05, + "loss": 1.1477, + "step": 6291, + "task_loss": 0.813787579536438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9492838382720947, + "epoch": 5.32, + "learning_rate": 2.600732600732601e-05, + "loss": 0.8821, + "step": 6292, + "task_loss": 1.889599084854126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1156110763549805, + "epoch": 5.32, + "learning_rate": 2.6002629848783693e-05, + "loss": 0.8706, + "step": 6293, + "task_loss": 1.081559419631958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3446491956710815, + "epoch": 5.32, + "learning_rate": 2.5997933690241383e-05, + "loss": 1.2839, + "step": 6294, + "task_loss": 0.49927690625190735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3964911699295044, + "epoch": 5.32, + "learning_rate": 2.5993237531699073e-05, + "loss": 1.1423, + "step": 6295, + "task_loss": 0.885152280330658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2084403038024902, + "epoch": 5.32, + "learning_rate": 2.598854137315676e-05, + "loss": 1.2049, + "step": 6296, + "task_loss": 1.9564460515975952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0270271301269531, + "epoch": 5.32, + "learning_rate": 2.5983845214614445e-05, + "loss": 1.0406, + "step": 6297, + "task_loss": 0.771664559841156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8043367862701416, + "epoch": 5.32, + "learning_rate": 2.5979149056072132e-05, + "loss": 1.2423, + "step": 6298, + "task_loss": 0.32122018933296204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.3620638847351074, + "epoch": 5.32, + "learning_rate": 2.597445289752982e-05, + "loss": 1.5326, + "step": 6299, + "task_loss": 2.170806884765625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.071317195892334, + "epoch": 5.33, + "learning_rate": 2.596975673898751e-05, + "loss": 1.1026, + "step": 6300, + "task_loss": 0.9808135628700256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0573582649230957, + "epoch": 5.33, + "learning_rate": 2.59650605804452e-05, + "loss": 1.0526, + "step": 6301, + "task_loss": 2.1081016063690186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.929741382598877, + "epoch": 5.33, + "learning_rate": 2.5960364421902884e-05, + "loss": 1.2154, + "step": 6302, + "task_loss": 1.0205265283584595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9894193410873413, + "epoch": 5.33, + "learning_rate": 2.5955668263360574e-05, + "loss": 0.9829, + "step": 6303, + "task_loss": 1.2885124683380127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.090564250946045, + "epoch": 5.33, + "learning_rate": 2.595097210481826e-05, + "loss": 1.1781, + "step": 6304, + "task_loss": 1.379763126373291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3314253091812134, + "epoch": 5.33, + "learning_rate": 2.594627594627595e-05, + "loss": 1.1251, + "step": 6305, + "task_loss": 1.8114311695098877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8091065883636475, + "epoch": 5.33, + "learning_rate": 2.5941579787733633e-05, + "loss": 1.132, + "step": 6306, + "task_loss": 1.0272351503372192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9011757373809814, + "epoch": 5.33, + "learning_rate": 2.5936883629191322e-05, + "loss": 1.0107, + "step": 6307, + "task_loss": 0.8120344877243042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2125520706176758, + "epoch": 5.33, + "learning_rate": 2.5932187470649012e-05, + "loss": 1.2476, + "step": 6308, + "task_loss": 0.8350540399551392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2465773820877075, + "epoch": 5.33, + "learning_rate": 2.59274913121067e-05, + "loss": 1.2381, + "step": 6309, + "task_loss": 0.7249597907066345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4670814275741577, + "epoch": 5.33, + "learning_rate": 2.5922795153564385e-05, + "loss": 1.2253, + "step": 6310, + "task_loss": 0.03881332278251648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.893643856048584, + "epoch": 5.33, + "learning_rate": 2.591809899502207e-05, + "loss": 1.3572, + "step": 6311, + "task_loss": 0.8605049252510071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6161984801292419, + "epoch": 5.34, + "learning_rate": 2.591340283647976e-05, + "loss": 0.7624, + "step": 6312, + "task_loss": 0.1707698553800583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.6236913204193115, + "epoch": 5.34, + "learning_rate": 2.590870667793745e-05, + "loss": 1.363, + "step": 6313, + "task_loss": 2.073456287384033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2565042972564697, + "epoch": 5.34, + "learning_rate": 2.590401051939514e-05, + "loss": 0.9292, + "step": 6314, + "task_loss": 0.464331716299057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8568236827850342, + "epoch": 5.34, + "learning_rate": 2.5899314360852823e-05, + "loss": 0.917, + "step": 6315, + "task_loss": 1.047317624092102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9772505164146423, + "epoch": 5.34, + "learning_rate": 2.589461820231051e-05, + "loss": 0.9018, + "step": 6316, + "task_loss": 0.35335761308670044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.4276371002197266, + "epoch": 5.34, + "learning_rate": 2.58899220437682e-05, + "loss": 1.3128, + "step": 6317, + "task_loss": 2.0015709400177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8495938181877136, + "epoch": 5.34, + "learning_rate": 2.588522588522589e-05, + "loss": 0.9544, + "step": 6318, + "task_loss": 1.1158232688903809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2351930141448975, + "epoch": 5.34, + "learning_rate": 2.5880529726683572e-05, + "loss": 1.1339, + "step": 6319, + "task_loss": 1.9154291152954102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8957180976867676, + "epoch": 5.34, + "learning_rate": 2.587583356814126e-05, + "loss": 0.8173, + "step": 6320, + "task_loss": 1.326109766960144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6073119640350342, + "epoch": 5.34, + "learning_rate": 2.587113740959895e-05, + "loss": 1.0063, + "step": 6321, + "task_loss": 1.1907532215118408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7874072790145874, + "epoch": 5.34, + "learning_rate": 2.5866441251056638e-05, + "loss": 0.9163, + "step": 6322, + "task_loss": 0.5948822498321533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0012092590332031, + "epoch": 5.34, + "learning_rate": 2.586174509251432e-05, + "loss": 0.8907, + "step": 6323, + "task_loss": 0.5844492316246033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7369522452354431, + "epoch": 5.35, + "learning_rate": 2.585704893397201e-05, + "loss": 0.8512, + "step": 6324, + "task_loss": 1.0217374563217163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5832953453063965, + "epoch": 5.35, + "learning_rate": 2.58523527754297e-05, + "loss": 1.066, + "step": 6325, + "task_loss": 1.427672266960144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9957594275474548, + "epoch": 5.35, + "learning_rate": 2.584765661688739e-05, + "loss": 0.8375, + "step": 6326, + "task_loss": 0.669663667678833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1842681169509888, + "epoch": 5.35, + "learning_rate": 2.5842960458345073e-05, + "loss": 0.9617, + "step": 6327, + "task_loss": 1.2276474237442017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0594830513000488, + "epoch": 5.35, + "learning_rate": 2.5838264299802763e-05, + "loss": 1.1535, + "step": 6328, + "task_loss": 0.8553057909011841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0487747192382812, + "epoch": 5.35, + "learning_rate": 2.583356814126045e-05, + "loss": 1.093, + "step": 6329, + "task_loss": 1.2517271041870117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0591256618499756, + "epoch": 5.35, + "learning_rate": 2.582887198271814e-05, + "loss": 1.0196, + "step": 6330, + "task_loss": 1.2961130142211914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9135128259658813, + "epoch": 5.35, + "learning_rate": 2.582417582417583e-05, + "loss": 1.1901, + "step": 6331, + "task_loss": 1.2081958055496216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0386688709259033, + "epoch": 5.35, + "learning_rate": 2.581947966563351e-05, + "loss": 0.8925, + "step": 6332, + "task_loss": 1.1858305931091309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9114164710044861, + "epoch": 5.35, + "learning_rate": 2.58147835070912e-05, + "loss": 0.8306, + "step": 6333, + "task_loss": 0.5890434980392456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6430786848068237, + "epoch": 5.35, + "learning_rate": 2.581008734854889e-05, + "loss": 0.9828, + "step": 6334, + "task_loss": 0.5364454388618469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4051508903503418, + "epoch": 5.35, + "learning_rate": 2.5805391190006577e-05, + "loss": 1.0941, + "step": 6335, + "task_loss": 1.1373642683029175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2859364748001099, + "epoch": 5.36, + "learning_rate": 2.580069503146426e-05, + "loss": 1.2581, + "step": 6336, + "task_loss": 0.8994808793067932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.506266713142395, + "epoch": 5.36, + "learning_rate": 2.579599887292195e-05, + "loss": 1.2173, + "step": 6337, + "task_loss": 0.8273641467094421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6704078316688538, + "epoch": 5.36, + "learning_rate": 2.579130271437964e-05, + "loss": 0.9506, + "step": 6338, + "task_loss": 0.5349453687667847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2382078170776367, + "epoch": 5.36, + "learning_rate": 2.578660655583733e-05, + "loss": 0.9746, + "step": 6339, + "task_loss": 1.844632625579834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.354675054550171, + "epoch": 5.36, + "learning_rate": 2.5781910397295012e-05, + "loss": 1.1253, + "step": 6340, + "task_loss": 0.5944096446037292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9370590448379517, + "epoch": 5.36, + "learning_rate": 2.5777214238752702e-05, + "loss": 0.7803, + "step": 6341, + "task_loss": 1.3131383657455444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9885606169700623, + "epoch": 5.36, + "learning_rate": 2.5772518080210388e-05, + "loss": 1.1081, + "step": 6342, + "task_loss": 0.886330783367157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5954943895339966, + "epoch": 5.36, + "learning_rate": 2.5767821921668078e-05, + "loss": 0.8166, + "step": 6343, + "task_loss": 0.5659304857254028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7303256988525391, + "epoch": 5.36, + "learning_rate": 2.576312576312576e-05, + "loss": 1.0771, + "step": 6344, + "task_loss": 1.3235678672790527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1352696418762207, + "epoch": 5.36, + "learning_rate": 2.575842960458345e-05, + "loss": 1.1772, + "step": 6345, + "task_loss": 0.709708571434021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6683835983276367, + "epoch": 5.36, + "learning_rate": 2.575373344604114e-05, + "loss": 0.796, + "step": 6346, + "task_loss": 0.7141897082328796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4479426145553589, + "epoch": 5.36, + "learning_rate": 2.5749037287498827e-05, + "loss": 1.202, + "step": 6347, + "task_loss": 1.5755990743637085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.9838846921920776, + "epoch": 5.37, + "learning_rate": 2.5744341128956516e-05, + "loss": 1.3168, + "step": 6348, + "task_loss": 1.615256428718567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.03053879737854, + "epoch": 5.37, + "learning_rate": 2.57396449704142e-05, + "loss": 1.0314, + "step": 6349, + "task_loss": 0.6874434351921082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.283419609069824, + "epoch": 5.37, + "learning_rate": 2.573494881187189e-05, + "loss": 1.567, + "step": 6350, + "task_loss": 2.063002824783325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8604526519775391, + "epoch": 5.37, + "learning_rate": 2.573025265332958e-05, + "loss": 0.9037, + "step": 6351, + "task_loss": 0.9422600269317627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3392837047576904, + "epoch": 5.37, + "learning_rate": 2.572555649478727e-05, + "loss": 1.0315, + "step": 6352, + "task_loss": 1.213079810142517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4964888095855713, + "epoch": 5.37, + "learning_rate": 2.572086033624495e-05, + "loss": 0.9462, + "step": 6353, + "task_loss": 1.4518071413040161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5437741279602051, + "epoch": 5.37, + "learning_rate": 2.571616417770264e-05, + "loss": 1.0782, + "step": 6354, + "task_loss": 0.9972289800643921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3377244472503662, + "epoch": 5.37, + "learning_rate": 2.5711468019160328e-05, + "loss": 1.2693, + "step": 6355, + "task_loss": 1.8939356803894043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9605881571769714, + "epoch": 5.37, + "learning_rate": 2.5706771860618017e-05, + "loss": 0.9315, + "step": 6356, + "task_loss": 0.6263020634651184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9925759434700012, + "epoch": 5.37, + "learning_rate": 2.57020757020757e-05, + "loss": 0.9375, + "step": 6357, + "task_loss": 0.5074746012687683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2417348623275757, + "epoch": 5.37, + "learning_rate": 2.569737954353339e-05, + "loss": 1.2393, + "step": 6358, + "task_loss": 0.947325587272644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5062440633773804, + "epoch": 5.38, + "learning_rate": 2.569268338499108e-05, + "loss": 0.9885, + "step": 6359, + "task_loss": 1.7746906280517578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0098721981048584, + "epoch": 5.38, + "learning_rate": 2.5687987226448766e-05, + "loss": 1.1323, + "step": 6360, + "task_loss": 2.448911190032959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3488571643829346, + "epoch": 5.38, + "learning_rate": 2.5683291067906456e-05, + "loss": 1.068, + "step": 6361, + "task_loss": 1.1848984956741333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2297766208648682, + "epoch": 5.38, + "learning_rate": 2.567859490936414e-05, + "loss": 1.0782, + "step": 6362, + "task_loss": 1.1673896312713623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7162044644355774, + "epoch": 5.38, + "learning_rate": 2.567389875082183e-05, + "loss": 0.939, + "step": 6363, + "task_loss": 0.730419933795929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8928181529045105, + "epoch": 5.38, + "learning_rate": 2.5669202592279518e-05, + "loss": 1.2395, + "step": 6364, + "task_loss": 0.8238903284072876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8681780695915222, + "epoch": 5.38, + "learning_rate": 2.5664506433737208e-05, + "loss": 1.0558, + "step": 6365, + "task_loss": 0.6736039519309998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8249871134757996, + "epoch": 5.38, + "learning_rate": 2.565981027519489e-05, + "loss": 1.0272, + "step": 6366, + "task_loss": 0.7399210333824158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1848400831222534, + "epoch": 5.38, + "learning_rate": 2.5655114116652577e-05, + "loss": 1.08, + "step": 6367, + "task_loss": 1.5130903720855713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7539374232292175, + "epoch": 5.38, + "learning_rate": 2.5650417958110267e-05, + "loss": 1.0892, + "step": 6368, + "task_loss": 0.2544321119785309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1642818450927734, + "epoch": 5.38, + "learning_rate": 2.5645721799567957e-05, + "loss": 1.0631, + "step": 6369, + "task_loss": 1.002232551574707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8273664712905884, + "epoch": 5.38, + "learning_rate": 2.564102564102564e-05, + "loss": 0.8958, + "step": 6370, + "task_loss": 0.3885570168495178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3125488758087158, + "epoch": 5.39, + "learning_rate": 2.563632948248333e-05, + "loss": 1.0597, + "step": 6371, + "task_loss": 1.3921027183532715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8883625864982605, + "epoch": 5.39, + "learning_rate": 2.563163332394102e-05, + "loss": 1.0651, + "step": 6372, + "task_loss": 0.6870517134666443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7650696039199829, + "epoch": 5.39, + "learning_rate": 2.5626937165398705e-05, + "loss": 0.7068, + "step": 6373, + "task_loss": 0.3232174217700958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5412269830703735, + "epoch": 5.39, + "learning_rate": 2.562224100685639e-05, + "loss": 1.3732, + "step": 6374, + "task_loss": 1.5598081350326538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8687403798103333, + "epoch": 5.39, + "learning_rate": 2.5617544848314078e-05, + "loss": 1.1884, + "step": 6375, + "task_loss": 0.5333778262138367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9229332208633423, + "epoch": 5.39, + "learning_rate": 2.5612848689771768e-05, + "loss": 0.9433, + "step": 6376, + "task_loss": 0.9237564206123352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8227337002754211, + "epoch": 5.39, + "learning_rate": 2.5608152531229458e-05, + "loss": 1.0356, + "step": 6377, + "task_loss": 0.581084668636322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7740417718887329, + "epoch": 5.39, + "learning_rate": 2.5603456372687147e-05, + "loss": 0.9609, + "step": 6378, + "task_loss": 0.7768348455429077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7141458988189697, + "epoch": 5.39, + "learning_rate": 2.559876021414483e-05, + "loss": 1.1227, + "step": 6379, + "task_loss": 1.545905351638794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1679857969284058, + "epoch": 5.39, + "learning_rate": 2.5594064055602517e-05, + "loss": 1.1004, + "step": 6380, + "task_loss": 1.456630825996399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.066922664642334, + "epoch": 5.39, + "learning_rate": 2.5589367897060206e-05, + "loss": 0.9414, + "step": 6381, + "task_loss": 1.155923843383789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3232605457305908, + "epoch": 5.39, + "learning_rate": 2.5584671738517896e-05, + "loss": 1.1557, + "step": 6382, + "task_loss": 0.5759899020195007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7464649677276611, + "epoch": 5.4, + "learning_rate": 2.557997557997558e-05, + "loss": 1.0929, + "step": 6383, + "task_loss": 1.3400397300720215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6620190143585205, + "epoch": 5.4, + "learning_rate": 2.557527942143327e-05, + "loss": 0.9794, + "step": 6384, + "task_loss": 1.5556213855743408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8300684690475464, + "epoch": 5.4, + "learning_rate": 2.557058326289096e-05, + "loss": 1.051, + "step": 6385, + "task_loss": 1.3163827657699585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0519795417785645, + "epoch": 5.4, + "learning_rate": 2.5565887104348645e-05, + "loss": 0.9844, + "step": 6386, + "task_loss": 1.060746431350708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8546610474586487, + "epoch": 5.4, + "learning_rate": 2.5561190945806328e-05, + "loss": 1.3334, + "step": 6387, + "task_loss": 0.619842529296875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1876801252365112, + "epoch": 5.4, + "learning_rate": 2.5556494787264017e-05, + "loss": 1.092, + "step": 6388, + "task_loss": 0.8291508555412292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3396753966808319, + "epoch": 5.4, + "learning_rate": 2.5551798628721707e-05, + "loss": 0.6644, + "step": 6389, + "task_loss": 0.029675286263227463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0201027393341064, + "epoch": 5.4, + "learning_rate": 2.5547102470179397e-05, + "loss": 1.0454, + "step": 6390, + "task_loss": 1.4074045419692993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4152026176452637, + "epoch": 5.4, + "learning_rate": 2.5542406311637083e-05, + "loss": 1.1861, + "step": 6391, + "task_loss": 1.4105784893035889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0058245658874512, + "epoch": 5.4, + "learning_rate": 2.553771015309477e-05, + "loss": 0.9379, + "step": 6392, + "task_loss": 0.800417423248291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9238878488540649, + "epoch": 5.4, + "learning_rate": 2.5533013994552456e-05, + "loss": 1.1094, + "step": 6393, + "task_loss": 0.30619028210639954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3001458644866943, + "epoch": 5.4, + "learning_rate": 2.5528317836010146e-05, + "loss": 1.2588, + "step": 6394, + "task_loss": 0.840109646320343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0764744281768799, + "epoch": 5.41, + "learning_rate": 2.5523621677467835e-05, + "loss": 0.9288, + "step": 6395, + "task_loss": 1.2149934768676758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6389381885528564, + "epoch": 5.41, + "learning_rate": 2.551892551892552e-05, + "loss": 0.8799, + "step": 6396, + "task_loss": 0.4354816675186157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0098598003387451, + "epoch": 5.41, + "learning_rate": 2.5514229360383208e-05, + "loss": 1.2358, + "step": 6397, + "task_loss": 1.1145069599151611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0959441661834717, + "epoch": 5.41, + "learning_rate": 2.5509533201840898e-05, + "loss": 1.1208, + "step": 6398, + "task_loss": 0.48302799463272095 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.240706443786621, + "epoch": 5.41, + "learning_rate": 2.5504837043298584e-05, + "loss": 1.3133, + "step": 6399, + "task_loss": 1.481557011604309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9447590708732605, + "epoch": 5.41, + "learning_rate": 2.5500140884756267e-05, + "loss": 1.0729, + "step": 6400, + "task_loss": 1.2816162109375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49677690863609314, + "epoch": 5.41, + "learning_rate": 2.5495444726213957e-05, + "loss": 0.8362, + "step": 6401, + "task_loss": 0.8680558800697327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7769114971160889, + "epoch": 5.41, + "learning_rate": 2.5490748567671647e-05, + "loss": 1.2789, + "step": 6402, + "task_loss": 1.5310560464859009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8306081295013428, + "epoch": 5.41, + "learning_rate": 2.5486052409129336e-05, + "loss": 0.8416, + "step": 6403, + "task_loss": 1.3402783870697021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0188164710998535, + "epoch": 5.41, + "learning_rate": 2.548135625058702e-05, + "loss": 0.9534, + "step": 6404, + "task_loss": 0.9778039455413818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6441875696182251, + "epoch": 5.41, + "learning_rate": 2.547666009204471e-05, + "loss": 0.9517, + "step": 6405, + "task_loss": 0.8688264489173889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.087812066078186, + "epoch": 5.41, + "learning_rate": 2.5471963933502395e-05, + "loss": 0.885, + "step": 6406, + "task_loss": 1.3358337879180908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7328872680664062, + "epoch": 5.42, + "learning_rate": 2.5467267774960085e-05, + "loss": 0.7827, + "step": 6407, + "task_loss": 0.5390071868896484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6655393242835999, + "epoch": 5.42, + "learning_rate": 2.5462571616417775e-05, + "loss": 0.7414, + "step": 6408, + "task_loss": 0.8244444727897644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.172795295715332, + "epoch": 5.42, + "learning_rate": 2.5457875457875458e-05, + "loss": 0.8842, + "step": 6409, + "task_loss": 0.9633015990257263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4870212078094482, + "epoch": 5.42, + "learning_rate": 2.5453179299333147e-05, + "loss": 1.1255, + "step": 6410, + "task_loss": 1.8594094514846802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5139474868774414, + "epoch": 5.42, + "learning_rate": 2.5448483140790834e-05, + "loss": 0.9578, + "step": 6411, + "task_loss": 1.861824631690979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4425194263458252, + "epoch": 5.42, + "learning_rate": 2.5443786982248524e-05, + "loss": 1.3059, + "step": 6412, + "task_loss": 1.045417070388794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9047155380249023, + "epoch": 5.42, + "learning_rate": 2.5439090823706206e-05, + "loss": 0.7934, + "step": 6413, + "task_loss": 1.5678315162658691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4728127717971802, + "epoch": 5.42, + "learning_rate": 2.5434394665163896e-05, + "loss": 1.0935, + "step": 6414, + "task_loss": 2.218679904937744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1783052682876587, + "epoch": 5.42, + "learning_rate": 2.5429698506621586e-05, + "loss": 1.039, + "step": 6415, + "task_loss": 0.8370128273963928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5235366821289062, + "epoch": 5.42, + "learning_rate": 2.5425002348079276e-05, + "loss": 1.0823, + "step": 6416, + "task_loss": 1.5622249841690063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6198564767837524, + "epoch": 5.42, + "learning_rate": 2.542030618953696e-05, + "loss": 1.3681, + "step": 6417, + "task_loss": 2.1341352462768555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9066821336746216, + "epoch": 5.42, + "learning_rate": 2.5415610030994645e-05, + "loss": 1.0496, + "step": 6418, + "task_loss": 0.7548201084136963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.037347674369812, + "epoch": 5.43, + "learning_rate": 2.5410913872452335e-05, + "loss": 1.086, + "step": 6419, + "task_loss": 1.3194986581802368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.062352180480957, + "epoch": 5.43, + "learning_rate": 2.5406217713910024e-05, + "loss": 1.201, + "step": 6420, + "task_loss": 0.6786300539970398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9179903268814087, + "epoch": 5.43, + "learning_rate": 2.5401521555367707e-05, + "loss": 0.8355, + "step": 6421, + "task_loss": 1.4967608451843262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0355240106582642, + "epoch": 5.43, + "learning_rate": 2.5396825396825397e-05, + "loss": 0.8997, + "step": 6422, + "task_loss": 0.616295337677002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0410408973693848, + "epoch": 5.43, + "learning_rate": 2.5392129238283087e-05, + "loss": 1.0801, + "step": 6423, + "task_loss": 0.7140810489654541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6238728165626526, + "epoch": 5.43, + "learning_rate": 2.5387433079740773e-05, + "loss": 0.8609, + "step": 6424, + "task_loss": 0.3634946942329407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5095426440238953, + "epoch": 5.43, + "learning_rate": 2.5382736921198463e-05, + "loss": 0.8151, + "step": 6425, + "task_loss": 0.16483628749847412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0267832279205322, + "epoch": 5.43, + "learning_rate": 2.5378040762656146e-05, + "loss": 0.8936, + "step": 6426, + "task_loss": 0.7907918691635132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3954848051071167, + "epoch": 5.43, + "learning_rate": 2.5373344604113836e-05, + "loss": 1.0977, + "step": 6427, + "task_loss": 1.5698412656784058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9310122728347778, + "epoch": 5.43, + "learning_rate": 2.5368648445571525e-05, + "loss": 0.8873, + "step": 6428, + "task_loss": 0.8901378512382507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1874616146087646, + "epoch": 5.43, + "learning_rate": 2.5363952287029215e-05, + "loss": 1.1701, + "step": 6429, + "task_loss": 0.5387741327285767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7063838243484497, + "epoch": 5.44, + "learning_rate": 2.5359256128486898e-05, + "loss": 0.7329, + "step": 6430, + "task_loss": 0.17581811547279358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0568475723266602, + "epoch": 5.44, + "learning_rate": 2.5354559969944584e-05, + "loss": 1.2625, + "step": 6431, + "task_loss": 0.5060216188430786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6904717087745667, + "epoch": 5.44, + "learning_rate": 2.5349863811402274e-05, + "loss": 0.7078, + "step": 6432, + "task_loss": 0.405549019575119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9340455532073975, + "epoch": 5.44, + "learning_rate": 2.5345167652859964e-05, + "loss": 1.2024, + "step": 6433, + "task_loss": 1.4663316011428833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0381760597229004, + "epoch": 5.44, + "learning_rate": 2.5340471494317647e-05, + "loss": 1.2214, + "step": 6434, + "task_loss": 1.3641736507415771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1436653137207031, + "epoch": 5.44, + "learning_rate": 2.5335775335775336e-05, + "loss": 0.8764, + "step": 6435, + "task_loss": 0.855593740940094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9543707370758057, + "epoch": 5.44, + "learning_rate": 2.5331079177233026e-05, + "loss": 0.7474, + "step": 6436, + "task_loss": 0.2517108917236328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6863232851028442, + "epoch": 5.44, + "learning_rate": 2.5326383018690712e-05, + "loss": 1.1251, + "step": 6437, + "task_loss": 1.0124092102050781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6661593914031982, + "epoch": 5.44, + "learning_rate": 2.5321686860148402e-05, + "loss": 0.7665, + "step": 6438, + "task_loss": 0.5492129325866699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6640273928642273, + "epoch": 5.44, + "learning_rate": 2.5316990701606085e-05, + "loss": 0.7951, + "step": 6439, + "task_loss": 0.9891870021820068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8081628084182739, + "epoch": 5.44, + "learning_rate": 2.5312294543063775e-05, + "loss": 0.9379, + "step": 6440, + "task_loss": 1.1161000728607178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2484506368637085, + "epoch": 5.44, + "learning_rate": 2.5307598384521465e-05, + "loss": 1.0055, + "step": 6441, + "task_loss": 1.342250943183899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0063486099243164, + "epoch": 5.45, + "learning_rate": 2.530290222597915e-05, + "loss": 1.1021, + "step": 6442, + "task_loss": 1.7969468832015991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9339218139648438, + "epoch": 5.45, + "learning_rate": 2.5298206067436837e-05, + "loss": 0.8242, + "step": 6443, + "task_loss": 1.0983366966247559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.0517544746398926, + "epoch": 5.45, + "learning_rate": 2.5293509908894524e-05, + "loss": 1.0788, + "step": 6444, + "task_loss": 1.6302683353424072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5570580959320068, + "epoch": 5.45, + "learning_rate": 2.5288813750352213e-05, + "loss": 1.2728, + "step": 6445, + "task_loss": 0.9016123414039612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.042505979537964, + "epoch": 5.45, + "learning_rate": 2.5284117591809903e-05, + "loss": 1.3635, + "step": 6446, + "task_loss": 2.062483549118042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9029207229614258, + "epoch": 5.45, + "learning_rate": 2.5279421433267586e-05, + "loss": 0.9055, + "step": 6447, + "task_loss": 0.47686198353767395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1344373226165771, + "epoch": 5.45, + "learning_rate": 2.5274725274725276e-05, + "loss": 1.0683, + "step": 6448, + "task_loss": 1.6097692251205444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5122966766357422, + "epoch": 5.45, + "learning_rate": 2.5270029116182966e-05, + "loss": 1.0566, + "step": 6449, + "task_loss": 1.5595813989639282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.3242008686065674, + "epoch": 5.45, + "learning_rate": 2.5265332957640652e-05, + "loss": 1.1961, + "step": 6450, + "task_loss": 0.9449503421783447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2608683109283447, + "epoch": 5.45, + "learning_rate": 2.5260636799098335e-05, + "loss": 0.9495, + "step": 6451, + "task_loss": 1.287846565246582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7174100875854492, + "epoch": 5.45, + "learning_rate": 2.5255940640556025e-05, + "loss": 1.059, + "step": 6452, + "task_loss": 0.5379125475883484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6162874698638916, + "epoch": 5.45, + "learning_rate": 2.5251244482013714e-05, + "loss": 0.9009, + "step": 6453, + "task_loss": 1.0124763250350952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2456603050231934, + "epoch": 5.46, + "learning_rate": 2.5246548323471404e-05, + "loss": 1.2965, + "step": 6454, + "task_loss": 1.1493887901306152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9956827163696289, + "epoch": 5.46, + "learning_rate": 2.524185216492909e-05, + "loss": 0.9841, + "step": 6455, + "task_loss": 0.7108720541000366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8716190457344055, + "epoch": 5.46, + "learning_rate": 2.5237156006386777e-05, + "loss": 1.0453, + "step": 6456, + "task_loss": 0.8884397149085999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6005128622055054, + "epoch": 5.46, + "learning_rate": 2.5232459847844463e-05, + "loss": 0.8864, + "step": 6457, + "task_loss": 0.6114558577537537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0184158086776733, + "epoch": 5.46, + "learning_rate": 2.5227763689302153e-05, + "loss": 0.8465, + "step": 6458, + "task_loss": 0.522857129573822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0012152194976807, + "epoch": 5.46, + "learning_rate": 2.5223067530759842e-05, + "loss": 0.8736, + "step": 6459, + "task_loss": 0.3753816485404968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.572245478630066, + "epoch": 5.46, + "learning_rate": 2.5218371372217525e-05, + "loss": 1.1394, + "step": 6460, + "task_loss": 1.2705318927764893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3130083084106445, + "epoch": 5.46, + "learning_rate": 2.5213675213675215e-05, + "loss": 1.0463, + "step": 6461, + "task_loss": 1.9780476093292236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9595435857772827, + "epoch": 5.46, + "learning_rate": 2.52089790551329e-05, + "loss": 0.7901, + "step": 6462, + "task_loss": 0.6375880241394043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6294265985488892, + "epoch": 5.46, + "learning_rate": 2.520428289659059e-05, + "loss": 0.9821, + "step": 6463, + "task_loss": 0.4364987313747406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9522368311882019, + "epoch": 5.46, + "learning_rate": 2.5199586738048274e-05, + "loss": 0.8825, + "step": 6464, + "task_loss": 0.6067593097686768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.031280517578125, + "epoch": 5.46, + "learning_rate": 2.5194890579505964e-05, + "loss": 1.0753, + "step": 6465, + "task_loss": 2.067537784576416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9081988334655762, + "epoch": 5.47, + "learning_rate": 2.5190194420963654e-05, + "loss": 0.9152, + "step": 6466, + "task_loss": 0.5675429105758667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1158198118209839, + "epoch": 5.47, + "learning_rate": 2.5185498262421343e-05, + "loss": 1.0251, + "step": 6467, + "task_loss": 1.5512065887451172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5131148099899292, + "epoch": 5.47, + "learning_rate": 2.518080210387903e-05, + "loss": 0.9735, + "step": 6468, + "task_loss": 0.6444335579872131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.117063045501709, + "epoch": 5.47, + "learning_rate": 2.5176105945336713e-05, + "loss": 1.369, + "step": 6469, + "task_loss": 2.939657688140869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5342209339141846, + "epoch": 5.47, + "learning_rate": 2.5171409786794402e-05, + "loss": 0.8961, + "step": 6470, + "task_loss": 0.44115594029426575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.410351037979126, + "epoch": 5.47, + "learning_rate": 2.5166713628252092e-05, + "loss": 1.0599, + "step": 6471, + "task_loss": 1.2014350891113281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7803982496261597, + "epoch": 5.47, + "learning_rate": 2.5162017469709782e-05, + "loss": 0.897, + "step": 6472, + "task_loss": 0.6406145095825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2381980419158936, + "epoch": 5.47, + "learning_rate": 2.5157321311167465e-05, + "loss": 1.0642, + "step": 6473, + "task_loss": 1.8437703847885132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2310930490493774, + "epoch": 5.47, + "learning_rate": 2.5152625152625155e-05, + "loss": 1.0296, + "step": 6474, + "task_loss": 1.4089546203613281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2360575199127197, + "epoch": 5.47, + "learning_rate": 2.514792899408284e-05, + "loss": 0.7497, + "step": 6475, + "task_loss": 0.6520443558692932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6642756462097168, + "epoch": 5.47, + "learning_rate": 2.514323283554053e-05, + "loss": 0.8133, + "step": 6476, + "task_loss": 1.9031150341033936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.697030782699585, + "epoch": 5.47, + "learning_rate": 2.5138536676998214e-05, + "loss": 1.1608, + "step": 6477, + "task_loss": 1.2578227519989014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2124602794647217, + "epoch": 5.48, + "learning_rate": 2.5133840518455903e-05, + "loss": 1.0097, + "step": 6478, + "task_loss": 1.5414373874664307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6035012006759644, + "epoch": 5.48, + "learning_rate": 2.5129144359913593e-05, + "loss": 0.9916, + "step": 6479, + "task_loss": 0.37748512625694275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.839076042175293, + "epoch": 5.48, + "learning_rate": 2.5124448201371283e-05, + "loss": 0.8477, + "step": 6480, + "task_loss": 0.9630509614944458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9062399864196777, + "epoch": 5.48, + "learning_rate": 2.5119752042828966e-05, + "loss": 0.7406, + "step": 6481, + "task_loss": 0.5637525320053101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.426987886428833, + "epoch": 5.48, + "learning_rate": 2.5115055884286652e-05, + "loss": 1.4049, + "step": 6482, + "task_loss": 1.1638402938842773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1536519527435303, + "epoch": 5.48, + "learning_rate": 2.5110359725744342e-05, + "loss": 1.0041, + "step": 6483, + "task_loss": 0.666979193687439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.410020112991333, + "epoch": 5.48, + "learning_rate": 2.510566356720203e-05, + "loss": 1.0886, + "step": 6484, + "task_loss": 0.9850988388061523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0810198783874512, + "epoch": 5.48, + "learning_rate": 2.510096740865972e-05, + "loss": 1.1591, + "step": 6485, + "task_loss": 1.395552635192871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9863530993461609, + "epoch": 5.48, + "learning_rate": 2.5096271250117404e-05, + "loss": 1.1201, + "step": 6486, + "task_loss": 1.1136263608932495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7814576029777527, + "epoch": 5.48, + "learning_rate": 2.5091575091575094e-05, + "loss": 1.1302, + "step": 6487, + "task_loss": 0.8994828462600708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1980841159820557, + "epoch": 5.48, + "learning_rate": 2.508687893303278e-05, + "loss": 1.0086, + "step": 6488, + "task_loss": 0.6037781238555908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0863840579986572, + "epoch": 5.48, + "learning_rate": 2.508218277449047e-05, + "loss": 1.0447, + "step": 6489, + "task_loss": 0.7677891850471497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4626713991165161, + "epoch": 5.49, + "learning_rate": 2.5077486615948153e-05, + "loss": 1.1299, + "step": 6490, + "task_loss": 1.5354024171829224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8204333782196045, + "epoch": 5.49, + "learning_rate": 2.5072790457405843e-05, + "loss": 1.0671, + "step": 6491, + "task_loss": 1.9802935123443604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0774654150009155, + "epoch": 5.49, + "learning_rate": 2.5068094298863532e-05, + "loss": 0.7544, + "step": 6492, + "task_loss": 0.3735019564628601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3140161037445068, + "epoch": 5.49, + "learning_rate": 2.506339814032122e-05, + "loss": 1.0066, + "step": 6493, + "task_loss": 1.0784987211227417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0194470882415771, + "epoch": 5.49, + "learning_rate": 2.5058701981778905e-05, + "loss": 0.8462, + "step": 6494, + "task_loss": 0.5586204528808594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6462347507476807, + "epoch": 5.49, + "learning_rate": 2.505400582323659e-05, + "loss": 1.207, + "step": 6495, + "task_loss": 0.7948214411735535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2999684810638428, + "epoch": 5.49, + "learning_rate": 2.504930966469428e-05, + "loss": 1.0452, + "step": 6496, + "task_loss": 1.4566434621810913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2516119480133057, + "epoch": 5.49, + "learning_rate": 2.504461350615197e-05, + "loss": 0.9019, + "step": 6497, + "task_loss": 1.380259394645691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8307926654815674, + "epoch": 5.49, + "learning_rate": 2.5039917347609654e-05, + "loss": 0.8174, + "step": 6498, + "task_loss": 0.9373601078987122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0203444957733154, + "epoch": 5.49, + "learning_rate": 2.5035221189067343e-05, + "loss": 0.7014, + "step": 6499, + "task_loss": 0.8968945741653442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3474643230438232, + "epoch": 5.49, + "learning_rate": 2.5030525030525033e-05, + "loss": 1.0668, + "step": 6500, + "task_loss": 0.5559388399124146 + }, + { + "epoch": 5.49, + "eval_accuracy": 0.873980198019802, + "eval_loss": 0.605010449886322, + "eval_runtime": 225.7443, + "eval_samples_per_second": 111.852, + "eval_steps_per_second": 0.877, + "step": 6500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6095402836799622, + "epoch": 5.5, + "learning_rate": 2.502582887198272e-05, + "loss": 0.802, + "step": 6501, + "task_loss": 0.6356408596038818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7141250967979431, + "epoch": 5.5, + "learning_rate": 2.502113271344041e-05, + "loss": 0.8824, + "step": 6502, + "task_loss": 0.9097650051116943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8958068490028381, + "epoch": 5.5, + "learning_rate": 2.5016436554898092e-05, + "loss": 1.0119, + "step": 6503, + "task_loss": 0.6776978969573975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.398125410079956, + "epoch": 5.5, + "learning_rate": 2.5011740396355782e-05, + "loss": 1.1917, + "step": 6504, + "task_loss": 1.3158059120178223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4891916513442993, + "epoch": 5.5, + "learning_rate": 2.500704423781347e-05, + "loss": 0.852, + "step": 6505, + "task_loss": 0.9764769077301025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1734192371368408, + "epoch": 5.5, + "learning_rate": 2.5002348079271158e-05, + "loss": 1.0656, + "step": 6506, + "task_loss": 1.528217077255249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8117177486419678, + "epoch": 5.5, + "learning_rate": 2.4997651920728844e-05, + "loss": 0.9229, + "step": 6507, + "task_loss": 1.1204307079315186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7633861899375916, + "epoch": 5.5, + "learning_rate": 2.499295576218653e-05, + "loss": 0.9314, + "step": 6508, + "task_loss": 0.5120180249214172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0024962425231934, + "epoch": 5.5, + "learning_rate": 2.498825960364422e-05, + "loss": 1.0553, + "step": 6509, + "task_loss": 0.9422429203987122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6643247604370117, + "epoch": 5.5, + "learning_rate": 2.4983563445101907e-05, + "loss": 0.8235, + "step": 6510, + "task_loss": 0.928342342376709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6740977764129639, + "epoch": 5.5, + "learning_rate": 2.4978867286559597e-05, + "loss": 1.2143, + "step": 6511, + "task_loss": 1.1002947092056274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2026362419128418, + "epoch": 5.5, + "learning_rate": 2.4974171128017283e-05, + "loss": 1.0856, + "step": 6512, + "task_loss": 1.3967161178588867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.417423129081726, + "epoch": 5.51, + "learning_rate": 2.496947496947497e-05, + "loss": 0.9627, + "step": 6513, + "task_loss": 1.164745807647705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9823552370071411, + "epoch": 5.51, + "learning_rate": 2.4964778810932656e-05, + "loss": 0.9378, + "step": 6514, + "task_loss": 1.4372062683105469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.463081955909729, + "epoch": 5.51, + "learning_rate": 2.4960082652390345e-05, + "loss": 0.8495, + "step": 6515, + "task_loss": 1.3777320384979248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7449542880058289, + "epoch": 5.51, + "learning_rate": 2.4955386493848035e-05, + "loss": 0.9332, + "step": 6516, + "task_loss": 0.9524458646774292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.097550392150879, + "epoch": 5.51, + "learning_rate": 2.495069033530572e-05, + "loss": 0.9055, + "step": 6517, + "task_loss": 0.8129556179046631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.516600251197815, + "epoch": 5.51, + "learning_rate": 2.494599417676341e-05, + "loss": 1.0558, + "step": 6518, + "task_loss": 1.2795346975326538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8112541437149048, + "epoch": 5.51, + "learning_rate": 2.4941298018221097e-05, + "loss": 1.153, + "step": 6519, + "task_loss": 1.0764656066894531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9789175987243652, + "epoch": 5.51, + "learning_rate": 2.4936601859678784e-05, + "loss": 0.9791, + "step": 6520, + "task_loss": 0.7618276476860046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.289778232574463, + "epoch": 5.51, + "learning_rate": 2.493190570113647e-05, + "loss": 1.1099, + "step": 6521, + "task_loss": 0.5469610691070557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9881772994995117, + "epoch": 5.51, + "learning_rate": 2.492720954259416e-05, + "loss": 0.7864, + "step": 6522, + "task_loss": 1.1781964302062988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2679753303527832, + "epoch": 5.51, + "learning_rate": 2.4922513384051846e-05, + "loss": 1.1556, + "step": 6523, + "task_loss": 1.5470633506774902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6113263964653015, + "epoch": 5.51, + "learning_rate": 2.4917817225509536e-05, + "loss": 0.8729, + "step": 6524, + "task_loss": 0.2104288637638092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9389169812202454, + "epoch": 5.52, + "learning_rate": 2.4913121066967222e-05, + "loss": 0.8711, + "step": 6525, + "task_loss": 0.9308289289474487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7133930325508118, + "epoch": 5.52, + "learning_rate": 2.490842490842491e-05, + "loss": 1.1651, + "step": 6526, + "task_loss": 0.25973373651504517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.632519543170929, + "epoch": 5.52, + "learning_rate": 2.4903728749882595e-05, + "loss": 0.7514, + "step": 6527, + "task_loss": 0.49152323603630066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2376489639282227, + "epoch": 5.52, + "learning_rate": 2.4899032591340285e-05, + "loss": 1.2604, + "step": 6528, + "task_loss": 1.007156252861023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8465752601623535, + "epoch": 5.52, + "learning_rate": 2.489433643279797e-05, + "loss": 0.9575, + "step": 6529, + "task_loss": 0.8186055421829224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1092650890350342, + "epoch": 5.52, + "learning_rate": 2.488964027425566e-05, + "loss": 1.1083, + "step": 6530, + "task_loss": 1.4498447179794312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7407197952270508, + "epoch": 5.52, + "learning_rate": 2.488494411571335e-05, + "loss": 0.8756, + "step": 6531, + "task_loss": 1.1178863048553467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34309059381484985, + "epoch": 5.52, + "learning_rate": 2.4880247957171037e-05, + "loss": 0.6501, + "step": 6532, + "task_loss": 0.2862367331981659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0120652914047241, + "epoch": 5.52, + "learning_rate": 2.4875551798628723e-05, + "loss": 1.0923, + "step": 6533, + "task_loss": 1.1884217262268066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.733035147190094, + "epoch": 5.52, + "learning_rate": 2.487085564008641e-05, + "loss": 0.9447, + "step": 6534, + "task_loss": 0.7153406739234924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3868988752365112, + "epoch": 5.52, + "learning_rate": 2.48661594815441e-05, + "loss": 1.0083, + "step": 6535, + "task_loss": 2.1528139114379883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.579030454158783, + "epoch": 5.52, + "learning_rate": 2.4861463323001785e-05, + "loss": 1.095, + "step": 6536, + "task_loss": 0.6510944366455078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1954929828643799, + "epoch": 5.53, + "learning_rate": 2.4856767164459475e-05, + "loss": 1.2041, + "step": 6537, + "task_loss": 1.4745815992355347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9510072469711304, + "epoch": 5.53, + "learning_rate": 2.485207100591716e-05, + "loss": 0.9835, + "step": 6538, + "task_loss": 0.5887629985809326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1454983949661255, + "epoch": 5.53, + "learning_rate": 2.4847374847374848e-05, + "loss": 0.9722, + "step": 6539, + "task_loss": 1.276520013809204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0835111141204834, + "epoch": 5.53, + "learning_rate": 2.4842678688832534e-05, + "loss": 1.1358, + "step": 6540, + "task_loss": 0.9604179859161377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5456302165985107, + "epoch": 5.53, + "learning_rate": 2.4837982530290224e-05, + "loss": 1.2501, + "step": 6541, + "task_loss": 1.2436788082122803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1599609851837158, + "epoch": 5.53, + "learning_rate": 2.483328637174791e-05, + "loss": 0.9358, + "step": 6542, + "task_loss": 0.6180224418640137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7082446813583374, + "epoch": 5.53, + "learning_rate": 2.48285902132056e-05, + "loss": 0.9288, + "step": 6543, + "task_loss": 0.9486300349235535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8146706819534302, + "epoch": 5.53, + "learning_rate": 2.4823894054663286e-05, + "loss": 0.9999, + "step": 6544, + "task_loss": 1.2737942934036255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5037171244621277, + "epoch": 5.53, + "learning_rate": 2.4819197896120973e-05, + "loss": 0.6562, + "step": 6545, + "task_loss": 0.359212726354599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4646756947040558, + "epoch": 5.53, + "learning_rate": 2.4814501737578662e-05, + "loss": 0.7019, + "step": 6546, + "task_loss": 0.7566305994987488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5524226427078247, + "epoch": 5.53, + "learning_rate": 2.480980557903635e-05, + "loss": 1.1367, + "step": 6547, + "task_loss": 0.037734489887952805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9199973940849304, + "epoch": 5.53, + "learning_rate": 2.480510942049404e-05, + "loss": 0.929, + "step": 6548, + "task_loss": 0.3293412923812866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7145240306854248, + "epoch": 5.54, + "learning_rate": 2.4800413261951725e-05, + "loss": 0.9385, + "step": 6549, + "task_loss": 1.1084599494934082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1709107160568237, + "epoch": 5.54, + "learning_rate": 2.4795717103409415e-05, + "loss": 0.9251, + "step": 6550, + "task_loss": 1.1088004112243652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7485247254371643, + "epoch": 5.54, + "learning_rate": 2.47910209448671e-05, + "loss": 0.7682, + "step": 6551, + "task_loss": 0.815598726272583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6231138706207275, + "epoch": 5.54, + "learning_rate": 2.4786324786324787e-05, + "loss": 0.9299, + "step": 6552, + "task_loss": 1.2794914245605469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7162492871284485, + "epoch": 5.54, + "learning_rate": 2.4781628627782474e-05, + "loss": 0.9879, + "step": 6553, + "task_loss": 0.5725975632667542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.915233314037323, + "epoch": 5.54, + "learning_rate": 2.4776932469240163e-05, + "loss": 1.1898, + "step": 6554, + "task_loss": 1.13390052318573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9578989744186401, + "epoch": 5.54, + "learning_rate": 2.477223631069785e-05, + "loss": 0.8763, + "step": 6555, + "task_loss": 0.8744196891784668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5787204504013062, + "epoch": 5.54, + "learning_rate": 2.476754015215554e-05, + "loss": 0.9306, + "step": 6556, + "task_loss": 0.6602758765220642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5607211589813232, + "epoch": 5.54, + "learning_rate": 2.4762843993613226e-05, + "loss": 0.7748, + "step": 6557, + "task_loss": 1.2263411283493042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1356621980667114, + "epoch": 5.54, + "learning_rate": 2.4758147835070912e-05, + "loss": 1.0768, + "step": 6558, + "task_loss": 1.7146135568618774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.689918041229248, + "epoch": 5.54, + "learning_rate": 2.47534516765286e-05, + "loss": 0.8194, + "step": 6559, + "task_loss": 0.4123992323875427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4333256781101227, + "epoch": 5.54, + "learning_rate": 2.4748755517986288e-05, + "loss": 0.7273, + "step": 6560, + "task_loss": 0.5912057757377625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2082090377807617, + "epoch": 5.55, + "learning_rate": 2.4744059359443978e-05, + "loss": 1.0529, + "step": 6561, + "task_loss": 1.2178003787994385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.495344877243042, + "epoch": 5.55, + "learning_rate": 2.4739363200901664e-05, + "loss": 1.1026, + "step": 6562, + "task_loss": 1.2726677656173706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.785637617111206, + "epoch": 5.55, + "learning_rate": 2.4734667042359354e-05, + "loss": 0.7171, + "step": 6563, + "task_loss": 0.5261896848678589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5374695658683777, + "epoch": 5.55, + "learning_rate": 2.4729970883817037e-05, + "loss": 1.0461, + "step": 6564, + "task_loss": 0.7869671583175659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.035266637802124, + "epoch": 5.55, + "learning_rate": 2.4725274725274727e-05, + "loss": 0.9079, + "step": 6565, + "task_loss": 1.838982105255127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.878603994846344, + "epoch": 5.55, + "learning_rate": 2.4720578566732413e-05, + "loss": 0.9438, + "step": 6566, + "task_loss": 1.0689141750335693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1558685302734375, + "epoch": 5.55, + "learning_rate": 2.4715882408190103e-05, + "loss": 0.8151, + "step": 6567, + "task_loss": 0.493290513753891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6507989168167114, + "epoch": 5.55, + "learning_rate": 2.471118624964779e-05, + "loss": 0.9318, + "step": 6568, + "task_loss": 1.5130853652954102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9253194332122803, + "epoch": 5.55, + "learning_rate": 2.470649009110548e-05, + "loss": 1.2166, + "step": 6569, + "task_loss": 0.41528743505477905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4330897331237793, + "epoch": 5.55, + "learning_rate": 2.4701793932563165e-05, + "loss": 0.8243, + "step": 6570, + "task_loss": 2.011565685272217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.149446964263916, + "epoch": 5.55, + "learning_rate": 2.469709777402085e-05, + "loss": 0.9914, + "step": 6571, + "task_loss": 1.4145113229751587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0650615692138672, + "epoch": 5.56, + "learning_rate": 2.4692401615478538e-05, + "loss": 0.9216, + "step": 6572, + "task_loss": 0.7961815595626831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0481393337249756, + "epoch": 5.56, + "learning_rate": 2.4687705456936227e-05, + "loss": 0.8509, + "step": 6573, + "task_loss": 0.45247191190719604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0091030597686768, + "epoch": 5.56, + "learning_rate": 2.4683009298393914e-05, + "loss": 0.926, + "step": 6574, + "task_loss": 1.0393537282943726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0851750373840332, + "epoch": 5.56, + "learning_rate": 2.4678313139851604e-05, + "loss": 0.8587, + "step": 6575, + "task_loss": 1.1208674907684326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9263315796852112, + "epoch": 5.56, + "learning_rate": 2.467361698130929e-05, + "loss": 1.138, + "step": 6576, + "task_loss": 0.6099129319190979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2647969722747803, + "epoch": 5.56, + "learning_rate": 2.4668920822766976e-05, + "loss": 0.9265, + "step": 6577, + "task_loss": 1.2099357843399048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0278624296188354, + "epoch": 5.56, + "learning_rate": 2.4664224664224666e-05, + "loss": 0.966, + "step": 6578, + "task_loss": 1.6882303953170776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40831315517425537, + "epoch": 5.56, + "learning_rate": 2.4659528505682352e-05, + "loss": 0.6916, + "step": 6579, + "task_loss": 0.1509709507226944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8878034353256226, + "epoch": 5.56, + "learning_rate": 2.4654832347140042e-05, + "loss": 0.788, + "step": 6580, + "task_loss": 1.2449076175689697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6834591627120972, + "epoch": 5.56, + "learning_rate": 2.465013618859773e-05, + "loss": 0.8149, + "step": 6581, + "task_loss": 0.617369532585144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0745313167572021, + "epoch": 5.56, + "learning_rate": 2.4645440030055418e-05, + "loss": 0.9187, + "step": 6582, + "task_loss": 2.049734592437744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0423119068145752, + "epoch": 5.56, + "learning_rate": 2.4640743871513104e-05, + "loss": 1.0125, + "step": 6583, + "task_loss": 0.963222086429596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.712994396686554, + "epoch": 5.57, + "learning_rate": 2.463604771297079e-05, + "loss": 0.7283, + "step": 6584, + "task_loss": 0.749672532081604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8948954939842224, + "epoch": 5.57, + "learning_rate": 2.4631351554428477e-05, + "loss": 0.7689, + "step": 6585, + "task_loss": 1.0379952192306519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7258709669113159, + "epoch": 5.57, + "learning_rate": 2.4626655395886167e-05, + "loss": 0.9312, + "step": 6586, + "task_loss": 0.9954637885093689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9688941240310669, + "epoch": 5.57, + "learning_rate": 2.4621959237343853e-05, + "loss": 0.8991, + "step": 6587, + "task_loss": 0.9115315675735474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5612451434135437, + "epoch": 5.57, + "learning_rate": 2.4617263078801543e-05, + "loss": 0.7929, + "step": 6588, + "task_loss": 0.45403170585632324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7728501558303833, + "epoch": 5.57, + "learning_rate": 2.461256692025923e-05, + "loss": 0.8073, + "step": 6589, + "task_loss": 0.8065351843833923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9504995346069336, + "epoch": 5.57, + "learning_rate": 2.4607870761716916e-05, + "loss": 0.9986, + "step": 6590, + "task_loss": 1.0132135152816772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.986470639705658, + "epoch": 5.57, + "learning_rate": 2.4603174603174602e-05, + "loss": 1.0332, + "step": 6591, + "task_loss": 0.8361361026763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5902015566825867, + "epoch": 5.57, + "learning_rate": 2.459847844463229e-05, + "loss": 0.7914, + "step": 6592, + "task_loss": 0.7981234192848206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8932263851165771, + "epoch": 5.57, + "learning_rate": 2.459378228608998e-05, + "loss": 0.9073, + "step": 6593, + "task_loss": 0.43015536665916443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9800308346748352, + "epoch": 5.57, + "learning_rate": 2.4589086127547668e-05, + "loss": 1.1785, + "step": 6594, + "task_loss": 1.1460130214691162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5378355979919434, + "epoch": 5.57, + "learning_rate": 2.4584389969005357e-05, + "loss": 1.2178, + "step": 6595, + "task_loss": 1.2029228210449219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.266120672225952, + "epoch": 5.58, + "learning_rate": 2.457969381046304e-05, + "loss": 1.3393, + "step": 6596, + "task_loss": 1.5405868291854858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7765187621116638, + "epoch": 5.58, + "learning_rate": 2.457499765192073e-05, + "loss": 0.8651, + "step": 6597, + "task_loss": 0.1597270369529724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6172748804092407, + "epoch": 5.58, + "learning_rate": 2.4570301493378416e-05, + "loss": 0.828, + "step": 6598, + "task_loss": 0.2836361825466156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8816103339195251, + "epoch": 5.58, + "learning_rate": 2.4565605334836106e-05, + "loss": 0.9148, + "step": 6599, + "task_loss": 0.6980390548706055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9197078347206116, + "epoch": 5.58, + "learning_rate": 2.4560909176293793e-05, + "loss": 0.9036, + "step": 6600, + "task_loss": 1.1312909126281738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1042873859405518, + "epoch": 5.58, + "learning_rate": 2.4556213017751482e-05, + "loss": 0.9931, + "step": 6601, + "task_loss": 1.4244621992111206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6930620074272156, + "epoch": 5.58, + "learning_rate": 2.455151685920917e-05, + "loss": 0.8476, + "step": 6602, + "task_loss": 0.7543149590492249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3054134845733643, + "epoch": 5.58, + "learning_rate": 2.4546820700666855e-05, + "loss": 0.9814, + "step": 6603, + "task_loss": 1.3226145505905151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2858620882034302, + "epoch": 5.58, + "learning_rate": 2.454212454212454e-05, + "loss": 1.0481, + "step": 6604, + "task_loss": 0.666312038898468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6937057971954346, + "epoch": 5.58, + "learning_rate": 2.453742838358223e-05, + "loss": 0.7125, + "step": 6605, + "task_loss": 0.8389298915863037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41905003786087036, + "epoch": 5.58, + "learning_rate": 2.4532732225039917e-05, + "loss": 0.5738, + "step": 6606, + "task_loss": 0.39687731862068176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6398493051528931, + "epoch": 5.58, + "learning_rate": 2.4528036066497607e-05, + "loss": 1.0518, + "step": 6607, + "task_loss": 0.8782630562782288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0880212783813477, + "epoch": 5.59, + "learning_rate": 2.4523339907955293e-05, + "loss": 0.9483, + "step": 6608, + "task_loss": 0.9946067333221436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5420563220977783, + "epoch": 5.59, + "learning_rate": 2.451864374941298e-05, + "loss": 1.106, + "step": 6609, + "task_loss": 1.0222811698913574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8119205832481384, + "epoch": 5.59, + "learning_rate": 2.451394759087067e-05, + "loss": 0.8444, + "step": 6610, + "task_loss": 1.0791847705841064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5991459488868713, + "epoch": 5.59, + "learning_rate": 2.4509251432328356e-05, + "loss": 0.9086, + "step": 6611, + "task_loss": 0.8332309722900391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.817090630531311, + "epoch": 5.59, + "learning_rate": 2.4504555273786046e-05, + "loss": 0.8717, + "step": 6612, + "task_loss": 1.6698169708251953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9613997936248779, + "epoch": 5.59, + "learning_rate": 2.4499859115243732e-05, + "loss": 1.0437, + "step": 6613, + "task_loss": 0.5155096650123596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8208536505699158, + "epoch": 5.59, + "learning_rate": 2.449516295670142e-05, + "loss": 0.8153, + "step": 6614, + "task_loss": 0.4769150912761688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44721049070358276, + "epoch": 5.59, + "learning_rate": 2.4490466798159105e-05, + "loss": 0.9333, + "step": 6615, + "task_loss": 0.7820694446563721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1996583938598633, + "epoch": 5.59, + "learning_rate": 2.4485770639616794e-05, + "loss": 1.1367, + "step": 6616, + "task_loss": 2.040287494659424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9728130102157593, + "epoch": 5.59, + "learning_rate": 2.448107448107448e-05, + "loss": 0.7319, + "step": 6617, + "task_loss": 1.1617364883422852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.855895459651947, + "epoch": 5.59, + "learning_rate": 2.447637832253217e-05, + "loss": 1.0211, + "step": 6618, + "task_loss": 1.2099485397338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9172717332839966, + "epoch": 5.59, + "learning_rate": 2.4471682163989857e-05, + "loss": 1.062, + "step": 6619, + "task_loss": 1.65121328830719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8879387378692627, + "epoch": 5.6, + "learning_rate": 2.4466986005447546e-05, + "loss": 1.0185, + "step": 6620, + "task_loss": 1.0745211839675903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8266237378120422, + "epoch": 5.6, + "learning_rate": 2.4462289846905233e-05, + "loss": 1.0461, + "step": 6621, + "task_loss": 1.063925862312317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.630460262298584, + "epoch": 5.6, + "learning_rate": 2.445759368836292e-05, + "loss": 1.1333, + "step": 6622, + "task_loss": 1.3138689994812012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6859695315361023, + "epoch": 5.6, + "learning_rate": 2.445289752982061e-05, + "loss": 0.8091, + "step": 6623, + "task_loss": 1.1032859086990356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.946871280670166, + "epoch": 5.6, + "learning_rate": 2.4448201371278295e-05, + "loss": 0.8633, + "step": 6624, + "task_loss": 1.0336421728134155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6470715403556824, + "epoch": 5.6, + "learning_rate": 2.4443505212735985e-05, + "loss": 0.8356, + "step": 6625, + "task_loss": 0.251172810792923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.907500147819519, + "epoch": 5.6, + "learning_rate": 2.443880905419367e-05, + "loss": 1.2698, + "step": 6626, + "task_loss": 1.0850331783294678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.136962652206421, + "epoch": 5.6, + "learning_rate": 2.443411289565136e-05, + "loss": 0.766, + "step": 6627, + "task_loss": 1.5016676187515259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8989626169204712, + "epoch": 5.6, + "learning_rate": 2.4429416737109044e-05, + "loss": 0.9549, + "step": 6628, + "task_loss": 0.7360947132110596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5806529521942139, + "epoch": 5.6, + "learning_rate": 2.4424720578566734e-05, + "loss": 0.7508, + "step": 6629, + "task_loss": 0.7200555801391602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8136186599731445, + "epoch": 5.6, + "learning_rate": 2.442002442002442e-05, + "loss": 0.8487, + "step": 6630, + "task_loss": 0.47695600986480713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.581135094165802, + "epoch": 5.6, + "learning_rate": 2.441532826148211e-05, + "loss": 0.8187, + "step": 6631, + "task_loss": 1.2110118865966797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.250044822692871, + "epoch": 5.61, + "learning_rate": 2.4410632102939796e-05, + "loss": 0.9441, + "step": 6632, + "task_loss": 1.0270390510559082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7059073448181152, + "epoch": 5.61, + "learning_rate": 2.4405935944397486e-05, + "loss": 1.2302, + "step": 6633, + "task_loss": 1.1635041236877441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.909400224685669, + "epoch": 5.61, + "learning_rate": 2.4401239785855172e-05, + "loss": 1.2041, + "step": 6634, + "task_loss": 0.7382453680038452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.537803590297699, + "epoch": 5.61, + "learning_rate": 2.439654362731286e-05, + "loss": 0.7804, + "step": 6635, + "task_loss": 0.351135790348053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 2.058112621307373, + "epoch": 5.61, + "learning_rate": 2.4391847468770545e-05, + "loss": 1.1022, + "step": 6636, + "task_loss": 1.5273407697677612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7188575863838196, + "epoch": 5.61, + "learning_rate": 2.4387151310228235e-05, + "loss": 0.9305, + "step": 6637, + "task_loss": 0.5073179602622986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.355460286140442, + "epoch": 5.61, + "learning_rate": 2.4382455151685924e-05, + "loss": 1.1358, + "step": 6638, + "task_loss": 1.515930414199829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3793998956680298, + "epoch": 5.61, + "learning_rate": 2.437775899314361e-05, + "loss": 0.9288, + "step": 6639, + "task_loss": 0.08861976861953735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5557202696800232, + "epoch": 5.61, + "learning_rate": 2.4373062834601297e-05, + "loss": 0.9369, + "step": 6640, + "task_loss": 0.7935031652450562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9381681084632874, + "epoch": 5.61, + "learning_rate": 2.4368366676058983e-05, + "loss": 0.9423, + "step": 6641, + "task_loss": 0.820948600769043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6585814952850342, + "epoch": 5.61, + "learning_rate": 2.4363670517516673e-05, + "loss": 0.8706, + "step": 6642, + "task_loss": 0.47166207432746887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7500200271606445, + "epoch": 5.61, + "learning_rate": 2.435897435897436e-05, + "loss": 1.0745, + "step": 6643, + "task_loss": 0.8711224794387817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7233695387840271, + "epoch": 5.62, + "learning_rate": 2.435427820043205e-05, + "loss": 0.9104, + "step": 6644, + "task_loss": 0.5659667253494263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8588014245033264, + "epoch": 5.62, + "learning_rate": 2.4349582041889735e-05, + "loss": 1.1046, + "step": 6645, + "task_loss": 0.7058823108673096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6325877904891968, + "epoch": 5.62, + "learning_rate": 2.4344885883347425e-05, + "loss": 0.8342, + "step": 6646, + "task_loss": 0.1982627958059311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5055789947509766, + "epoch": 5.62, + "learning_rate": 2.4340189724805108e-05, + "loss": 0.8033, + "step": 6647, + "task_loss": 0.6870417594909668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.86025071144104, + "epoch": 5.62, + "learning_rate": 2.4335493566262798e-05, + "loss": 0.935, + "step": 6648, + "task_loss": 0.7079517841339111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2837390899658203, + "epoch": 5.62, + "learning_rate": 2.4330797407720484e-05, + "loss": 0.9875, + "step": 6649, + "task_loss": 0.7962954044342041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8910945653915405, + "epoch": 5.62, + "learning_rate": 2.4326101249178174e-05, + "loss": 0.8498, + "step": 6650, + "task_loss": 1.0536521673202515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.671379566192627, + "epoch": 5.62, + "learning_rate": 2.432140509063586e-05, + "loss": 0.7696, + "step": 6651, + "task_loss": 0.8075859546661377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7907781600952148, + "epoch": 5.62, + "learning_rate": 2.431670893209355e-05, + "loss": 0.7466, + "step": 6652, + "task_loss": 1.156365156173706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0830881595611572, + "epoch": 5.62, + "learning_rate": 2.4312012773551236e-05, + "loss": 1.0515, + "step": 6653, + "task_loss": 1.0267937183380127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.833695113658905, + "epoch": 5.62, + "learning_rate": 2.4307316615008923e-05, + "loss": 0.6692, + "step": 6654, + "task_loss": 0.7679073214530945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1598137617111206, + "epoch": 5.63, + "learning_rate": 2.4302620456466612e-05, + "loss": 0.9369, + "step": 6655, + "task_loss": 1.5609127283096313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8662961721420288, + "epoch": 5.63, + "learning_rate": 2.42979242979243e-05, + "loss": 0.791, + "step": 6656, + "task_loss": 1.138088583946228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0670026540756226, + "epoch": 5.63, + "learning_rate": 2.429322813938199e-05, + "loss": 0.9298, + "step": 6657, + "task_loss": 1.2662304639816284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.373207449913025, + "epoch": 5.63, + "learning_rate": 2.4288531980839675e-05, + "loss": 1.1095, + "step": 6658, + "task_loss": 0.8894612789154053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.786376953125, + "epoch": 5.63, + "learning_rate": 2.428383582229736e-05, + "loss": 0.959, + "step": 6659, + "task_loss": 0.3115834593772888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1645961999893188, + "epoch": 5.63, + "learning_rate": 2.4279139663755047e-05, + "loss": 0.9098, + "step": 6660, + "task_loss": 1.06203293800354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6678271293640137, + "epoch": 5.63, + "learning_rate": 2.4274443505212737e-05, + "loss": 0.8207, + "step": 6661, + "task_loss": 1.1759073734283447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0283564329147339, + "epoch": 5.63, + "learning_rate": 2.4269747346670424e-05, + "loss": 1.0949, + "step": 6662, + "task_loss": 1.5555089712142944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7841814756393433, + "epoch": 5.63, + "learning_rate": 2.4265051188128113e-05, + "loss": 0.7616, + "step": 6663, + "task_loss": 0.640498697757721 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5832818746566772, + "epoch": 5.63, + "learning_rate": 2.42603550295858e-05, + "loss": 0.7594, + "step": 6664, + "task_loss": 0.8962552547454834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1064479351043701, + "epoch": 5.63, + "learning_rate": 2.425565887104349e-05, + "loss": 1.021, + "step": 6665, + "task_loss": 0.8834772109985352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6273853778839111, + "epoch": 5.63, + "learning_rate": 2.4250962712501176e-05, + "loss": 0.726, + "step": 6666, + "task_loss": 0.6214941740036011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9151614904403687, + "epoch": 5.64, + "learning_rate": 2.4246266553958862e-05, + "loss": 0.9807, + "step": 6667, + "task_loss": 1.1253281831741333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9365876317024231, + "epoch": 5.64, + "learning_rate": 2.424157039541655e-05, + "loss": 0.9357, + "step": 6668, + "task_loss": 0.24669693410396576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0428639650344849, + "epoch": 5.64, + "learning_rate": 2.4236874236874238e-05, + "loss": 0.8066, + "step": 6669, + "task_loss": 0.5238329172134399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1816184520721436, + "epoch": 5.64, + "learning_rate": 2.4232178078331928e-05, + "loss": 1.3074, + "step": 6670, + "task_loss": 0.8739794492721558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4579183757305145, + "epoch": 5.64, + "learning_rate": 2.4227481919789614e-05, + "loss": 0.6383, + "step": 6671, + "task_loss": 0.6071368455886841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.847671627998352, + "epoch": 5.64, + "learning_rate": 2.42227857612473e-05, + "loss": 0.7871, + "step": 6672, + "task_loss": 0.7136431932449341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.100651502609253, + "epoch": 5.64, + "learning_rate": 2.4218089602704987e-05, + "loss": 1.2206, + "step": 6673, + "task_loss": 1.5671916007995605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0612430572509766, + "epoch": 5.64, + "learning_rate": 2.4213393444162677e-05, + "loss": 0.9326, + "step": 6674, + "task_loss": 0.807921290397644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5986188650131226, + "epoch": 5.64, + "learning_rate": 2.4208697285620363e-05, + "loss": 0.7661, + "step": 6675, + "task_loss": 0.17334291338920593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6017254590988159, + "epoch": 5.64, + "learning_rate": 2.4204001127078053e-05, + "loss": 0.7954, + "step": 6676, + "task_loss": 0.4023938775062561 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9478189945220947, + "epoch": 5.64, + "learning_rate": 2.419930496853574e-05, + "loss": 0.7301, + "step": 6677, + "task_loss": 0.9096418619155884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42318636178970337, + "epoch": 5.64, + "learning_rate": 2.419460880999343e-05, + "loss": 0.76, + "step": 6678, + "task_loss": 0.27015188336372375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7596333622932434, + "epoch": 5.65, + "learning_rate": 2.418991265145111e-05, + "loss": 0.875, + "step": 6679, + "task_loss": 0.662571907043457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2702337503433228, + "epoch": 5.65, + "learning_rate": 2.41852164929088e-05, + "loss": 1.1335, + "step": 6680, + "task_loss": 1.0464636087417603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6935229301452637, + "epoch": 5.65, + "learning_rate": 2.4180520334366488e-05, + "loss": 0.7341, + "step": 6681, + "task_loss": 0.10915953665971756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4639397859573364, + "epoch": 5.65, + "learning_rate": 2.4175824175824177e-05, + "loss": 0.5897, + "step": 6682, + "task_loss": 0.2648518681526184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5799527168273926, + "epoch": 5.65, + "learning_rate": 2.4171128017281864e-05, + "loss": 0.8275, + "step": 6683, + "task_loss": 0.5818865895271301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3729114532470703, + "epoch": 5.65, + "learning_rate": 2.4166431858739553e-05, + "loss": 0.9814, + "step": 6684, + "task_loss": 0.7873285412788391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1515474319458008, + "epoch": 5.65, + "learning_rate": 2.416173570019724e-05, + "loss": 0.9354, + "step": 6685, + "task_loss": 0.9992050528526306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9275329113006592, + "epoch": 5.65, + "learning_rate": 2.4157039541654926e-05, + "loss": 0.9854, + "step": 6686, + "task_loss": 1.4290953874588013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4899478852748871, + "epoch": 5.65, + "learning_rate": 2.4152343383112616e-05, + "loss": 0.6192, + "step": 6687, + "task_loss": 0.3014602065086365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1952579021453857, + "epoch": 5.65, + "learning_rate": 2.4147647224570302e-05, + "loss": 1.0752, + "step": 6688, + "task_loss": 1.1045126914978027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.698963463306427, + "epoch": 5.65, + "learning_rate": 2.4142951066027992e-05, + "loss": 0.666, + "step": 6689, + "task_loss": 1.2605646848678589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7402207851409912, + "epoch": 5.65, + "learning_rate": 2.413825490748568e-05, + "loss": 0.6763, + "step": 6690, + "task_loss": 1.1705387830734253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9535470008850098, + "epoch": 5.66, + "learning_rate": 2.4133558748943365e-05, + "loss": 1.3241, + "step": 6691, + "task_loss": 1.0142521858215332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1447124481201172, + "epoch": 5.66, + "learning_rate": 2.412886259040105e-05, + "loss": 0.9807, + "step": 6692, + "task_loss": 1.9657164812088013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0589673519134521, + "epoch": 5.66, + "learning_rate": 2.412416643185874e-05, + "loss": 1.0214, + "step": 6693, + "task_loss": 1.2299724817276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9088494777679443, + "epoch": 5.66, + "learning_rate": 2.4119470273316427e-05, + "loss": 0.8565, + "step": 6694, + "task_loss": 0.5997495055198669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.142212152481079, + "epoch": 5.66, + "learning_rate": 2.4114774114774117e-05, + "loss": 1.0995, + "step": 6695, + "task_loss": 1.0152781009674072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1859357357025146, + "epoch": 5.66, + "learning_rate": 2.4110077956231803e-05, + "loss": 1.0325, + "step": 6696, + "task_loss": 0.87417072057724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9592379331588745, + "epoch": 5.66, + "learning_rate": 2.4105381797689493e-05, + "loss": 0.8754, + "step": 6697, + "task_loss": 0.7784700989723206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9023374319076538, + "epoch": 5.66, + "learning_rate": 2.4100685639147176e-05, + "loss": 0.9607, + "step": 6698, + "task_loss": 0.25220412015914917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7597769498825073, + "epoch": 5.66, + "learning_rate": 2.4095989480604866e-05, + "loss": 0.7456, + "step": 6699, + "task_loss": 1.1861598491668701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2931573390960693, + "epoch": 5.66, + "learning_rate": 2.4091293322062555e-05, + "loss": 0.9023, + "step": 6700, + "task_loss": 1.8444111347198486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6457901000976562, + "epoch": 5.66, + "learning_rate": 2.408659716352024e-05, + "loss": 0.7163, + "step": 6701, + "task_loss": 0.7411329746246338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.874161958694458, + "epoch": 5.66, + "learning_rate": 2.408190100497793e-05, + "loss": 0.7032, + "step": 6702, + "task_loss": 0.5570341348648071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0381759405136108, + "epoch": 5.67, + "learning_rate": 2.4077204846435618e-05, + "loss": 0.8266, + "step": 6703, + "task_loss": 1.0522780418395996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0730299949645996, + "epoch": 5.67, + "learning_rate": 2.4072508687893304e-05, + "loss": 0.9997, + "step": 6704, + "task_loss": 1.4148147106170654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.285895824432373, + "epoch": 5.67, + "learning_rate": 2.406781252935099e-05, + "loss": 1.201, + "step": 6705, + "task_loss": 2.1042075157165527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7462339401245117, + "epoch": 5.67, + "learning_rate": 2.406311637080868e-05, + "loss": 0.67, + "step": 6706, + "task_loss": 0.7871127128601074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8379177451133728, + "epoch": 5.67, + "learning_rate": 2.4058420212266366e-05, + "loss": 0.7469, + "step": 6707, + "task_loss": 1.5637456178665161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6633034944534302, + "epoch": 5.67, + "learning_rate": 2.4053724053724056e-05, + "loss": 0.8865, + "step": 6708, + "task_loss": 0.5655769109725952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9467108249664307, + "epoch": 5.67, + "learning_rate": 2.4049027895181742e-05, + "loss": 0.9702, + "step": 6709, + "task_loss": 0.8540567755699158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8440371751785278, + "epoch": 5.67, + "learning_rate": 2.404433173663943e-05, + "loss": 0.7798, + "step": 6710, + "task_loss": 0.8622531294822693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9724523425102234, + "epoch": 5.67, + "learning_rate": 2.4039635578097115e-05, + "loss": 1.1839, + "step": 6711, + "task_loss": 0.9750789403915405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7582234144210815, + "epoch": 5.67, + "learning_rate": 2.4034939419554805e-05, + "loss": 0.757, + "step": 6712, + "task_loss": 0.29026368260383606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48764902353286743, + "epoch": 5.67, + "learning_rate": 2.403024326101249e-05, + "loss": 0.8076, + "step": 6713, + "task_loss": 0.5419002175331116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2826939821243286, + "epoch": 5.67, + "learning_rate": 2.402554710247018e-05, + "loss": 0.9619, + "step": 6714, + "task_loss": 1.2972105741500854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6389666795730591, + "epoch": 5.68, + "learning_rate": 2.402085094392787e-05, + "loss": 0.8879, + "step": 6715, + "task_loss": 0.6819944381713867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7594606280326843, + "epoch": 5.68, + "learning_rate": 2.4016154785385557e-05, + "loss": 0.8273, + "step": 6716, + "task_loss": 0.23025716841220856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.803584098815918, + "epoch": 5.68, + "learning_rate": 2.4011458626843243e-05, + "loss": 0.8802, + "step": 6717, + "task_loss": 0.6110880970954895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8747059106826782, + "epoch": 5.68, + "learning_rate": 2.400676246830093e-05, + "loss": 0.7512, + "step": 6718, + "task_loss": 2.2715461254119873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.910496711730957, + "epoch": 5.68, + "learning_rate": 2.400206630975862e-05, + "loss": 1.2916, + "step": 6719, + "task_loss": 0.9776650667190552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2206528186798096, + "epoch": 5.68, + "learning_rate": 2.3997370151216306e-05, + "loss": 0.8925, + "step": 6720, + "task_loss": 1.4328557252883911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4446865320205688, + "epoch": 5.68, + "learning_rate": 2.3992673992673995e-05, + "loss": 0.964, + "step": 6721, + "task_loss": 0.685194194316864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8352596759796143, + "epoch": 5.68, + "learning_rate": 2.3987977834131682e-05, + "loss": 0.8212, + "step": 6722, + "task_loss": 0.4034329056739807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5897613763809204, + "epoch": 5.68, + "learning_rate": 2.3983281675589368e-05, + "loss": 1.0465, + "step": 6723, + "task_loss": 2.1436948776245117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0675944089889526, + "epoch": 5.68, + "learning_rate": 2.3978585517047055e-05, + "loss": 1.0278, + "step": 6724, + "task_loss": 0.9114589691162109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7282688617706299, + "epoch": 5.68, + "learning_rate": 2.3973889358504744e-05, + "loss": 0.9645, + "step": 6725, + "task_loss": 0.628159761428833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9330127835273743, + "epoch": 5.69, + "learning_rate": 2.396919319996243e-05, + "loss": 0.8942, + "step": 6726, + "task_loss": 0.9269289970397949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4653854370117188, + "epoch": 5.69, + "learning_rate": 2.396449704142012e-05, + "loss": 1.269, + "step": 6727, + "task_loss": 1.2264471054077148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2153143882751465, + "epoch": 5.69, + "learning_rate": 2.3959800882877807e-05, + "loss": 0.9267, + "step": 6728, + "task_loss": 1.8929495811462402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5065398216247559, + "epoch": 5.69, + "learning_rate": 2.3955104724335496e-05, + "loss": 0.8302, + "step": 6729, + "task_loss": 0.41992226243019104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6887391209602356, + "epoch": 5.69, + "learning_rate": 2.3950408565793183e-05, + "loss": 0.6423, + "step": 6730, + "task_loss": 1.3406347036361694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6049679517745972, + "epoch": 5.69, + "learning_rate": 2.394571240725087e-05, + "loss": 0.7939, + "step": 6731, + "task_loss": 1.6558846235275269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8503549695014954, + "epoch": 5.69, + "learning_rate": 2.394101624870856e-05, + "loss": 1.1088, + "step": 6732, + "task_loss": 1.0449090003967285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.720858097076416, + "epoch": 5.69, + "learning_rate": 2.3936320090166245e-05, + "loss": 0.7858, + "step": 6733, + "task_loss": 0.2571253478527069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8401659727096558, + "epoch": 5.69, + "learning_rate": 2.3931623931623935e-05, + "loss": 0.8951, + "step": 6734, + "task_loss": 1.1766583919525146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.918848991394043, + "epoch": 5.69, + "learning_rate": 2.392692777308162e-05, + "loss": 0.8965, + "step": 6735, + "task_loss": 0.5514716506004333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.053659200668335, + "epoch": 5.69, + "learning_rate": 2.3922231614539308e-05, + "loss": 0.9142, + "step": 6736, + "task_loss": 1.1920145750045776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0580832958221436, + "epoch": 5.69, + "learning_rate": 2.3917535455996994e-05, + "loss": 0.7459, + "step": 6737, + "task_loss": 0.7092434167861938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0503007173538208, + "epoch": 5.7, + "learning_rate": 2.3912839297454684e-05, + "loss": 0.8874, + "step": 6738, + "task_loss": 1.0656776428222656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0154904127120972, + "epoch": 5.7, + "learning_rate": 2.390814313891237e-05, + "loss": 0.9125, + "step": 6739, + "task_loss": 0.33276402950286865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.805359959602356, + "epoch": 5.7, + "learning_rate": 2.390344698037006e-05, + "loss": 0.8591, + "step": 6740, + "task_loss": 0.34107446670532227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7013375163078308, + "epoch": 5.7, + "learning_rate": 2.3898750821827746e-05, + "loss": 0.6753, + "step": 6741, + "task_loss": 1.3797718286514282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9709469079971313, + "epoch": 5.7, + "learning_rate": 2.3894054663285432e-05, + "loss": 1.2308, + "step": 6742, + "task_loss": 1.1232008934020996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8366460204124451, + "epoch": 5.7, + "learning_rate": 2.388935850474312e-05, + "loss": 0.9826, + "step": 6743, + "task_loss": 1.3507962226867676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6966295838356018, + "epoch": 5.7, + "learning_rate": 2.388466234620081e-05, + "loss": 0.8546, + "step": 6744, + "task_loss": 1.15293550491333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9821621179580688, + "epoch": 5.7, + "learning_rate": 2.3879966187658495e-05, + "loss": 0.9458, + "step": 6745, + "task_loss": 1.179903268814087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.782833456993103, + "epoch": 5.7, + "learning_rate": 2.3875270029116184e-05, + "loss": 0.9911, + "step": 6746, + "task_loss": 1.1119225025177002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1905227899551392, + "epoch": 5.7, + "learning_rate": 2.3870573870573874e-05, + "loss": 1.0834, + "step": 6747, + "task_loss": 0.3836309611797333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5821097493171692, + "epoch": 5.7, + "learning_rate": 2.386587771203156e-05, + "loss": 0.9759, + "step": 6748, + "task_loss": 0.610830545425415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.991465151309967, + "epoch": 5.7, + "learning_rate": 2.3861181553489247e-05, + "loss": 0.7552, + "step": 6749, + "task_loss": 1.2862426042556763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.753917932510376, + "epoch": 5.71, + "learning_rate": 2.3856485394946933e-05, + "loss": 0.6696, + "step": 6750, + "task_loss": 0.2988802194595337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49491411447525024, + "epoch": 5.71, + "learning_rate": 2.3851789236404623e-05, + "loss": 0.6805, + "step": 6751, + "task_loss": 0.892216682434082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0529417991638184, + "epoch": 5.71, + "learning_rate": 2.384709307786231e-05, + "loss": 0.7675, + "step": 6752, + "task_loss": 0.973456621170044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5247900485992432, + "epoch": 5.71, + "learning_rate": 2.384239691932e-05, + "loss": 0.8147, + "step": 6753, + "task_loss": 0.55158531665802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9365881085395813, + "epoch": 5.71, + "learning_rate": 2.3837700760777685e-05, + "loss": 0.8407, + "step": 6754, + "task_loss": 2.148561477661133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6395298838615417, + "epoch": 5.71, + "learning_rate": 2.3833004602235372e-05, + "loss": 0.9224, + "step": 6755, + "task_loss": 1.1203320026397705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5307620763778687, + "epoch": 5.71, + "learning_rate": 2.3828308443693058e-05, + "loss": 0.829, + "step": 6756, + "task_loss": 0.4411768913269043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2598950862884521, + "epoch": 5.71, + "learning_rate": 2.3823612285150748e-05, + "loss": 0.9772, + "step": 6757, + "task_loss": 1.4861348867416382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.612754225730896, + "epoch": 5.71, + "learning_rate": 2.3818916126608434e-05, + "loss": 0.767, + "step": 6758, + "task_loss": 0.7962609529495239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.657630205154419, + "epoch": 5.71, + "learning_rate": 2.3814219968066124e-05, + "loss": 0.7489, + "step": 6759, + "task_loss": 0.8585364818572998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9311295747756958, + "epoch": 5.71, + "learning_rate": 2.380952380952381e-05, + "loss": 0.8249, + "step": 6760, + "task_loss": 1.1434255838394165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8541572690010071, + "epoch": 5.71, + "learning_rate": 2.38048276509815e-05, + "loss": 1.03, + "step": 6761, + "task_loss": 0.6732721328735352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6410472393035889, + "epoch": 5.72, + "learning_rate": 2.3800131492439186e-05, + "loss": 0.7643, + "step": 6762, + "task_loss": 0.4496767520904541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8636772632598877, + "epoch": 5.72, + "learning_rate": 2.3795435333896873e-05, + "loss": 0.7582, + "step": 6763, + "task_loss": 0.48120084404945374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5913612246513367, + "epoch": 5.72, + "learning_rate": 2.3790739175354562e-05, + "loss": 0.9875, + "step": 6764, + "task_loss": 0.9333608150482178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0523264408111572, + "epoch": 5.72, + "learning_rate": 2.378604301681225e-05, + "loss": 1.1798, + "step": 6765, + "task_loss": 1.005062460899353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.055492639541626, + "epoch": 5.72, + "learning_rate": 2.378134685826994e-05, + "loss": 1.1007, + "step": 6766, + "task_loss": 1.5288063287734985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0697102546691895, + "epoch": 5.72, + "learning_rate": 2.3776650699727625e-05, + "loss": 1.1148, + "step": 6767, + "task_loss": 0.9422531723976135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9807431697845459, + "epoch": 5.72, + "learning_rate": 2.377195454118531e-05, + "loss": 0.9831, + "step": 6768, + "task_loss": 1.160199761390686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8231852054595947, + "epoch": 5.72, + "learning_rate": 2.3767258382642997e-05, + "loss": 1.1457, + "step": 6769, + "task_loss": 1.8833708763122559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7812897562980652, + "epoch": 5.72, + "learning_rate": 2.3762562224100687e-05, + "loss": 0.8156, + "step": 6770, + "task_loss": 0.5552233457565308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6236609816551208, + "epoch": 5.72, + "learning_rate": 2.3757866065558373e-05, + "loss": 0.9773, + "step": 6771, + "task_loss": 0.7118287682533264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6649678945541382, + "epoch": 5.72, + "learning_rate": 2.3753169907016063e-05, + "loss": 0.8662, + "step": 6772, + "task_loss": 0.9807431697845459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4607981741428375, + "epoch": 5.72, + "learning_rate": 2.374847374847375e-05, + "loss": 0.8819, + "step": 6773, + "task_loss": 1.773616075515747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6728079319000244, + "epoch": 5.73, + "learning_rate": 2.3743777589931436e-05, + "loss": 0.7518, + "step": 6774, + "task_loss": 0.49786239862442017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.098536491394043, + "epoch": 5.73, + "learning_rate": 2.3739081431389122e-05, + "loss": 0.9188, + "step": 6775, + "task_loss": 1.0983518362045288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7141021490097046, + "epoch": 5.73, + "learning_rate": 2.3734385272846812e-05, + "loss": 0.7367, + "step": 6776, + "task_loss": 0.7454456090927124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47558853030204773, + "epoch": 5.73, + "learning_rate": 2.37296891143045e-05, + "loss": 0.4233, + "step": 6777, + "task_loss": 0.14606861770153046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4589689075946808, + "epoch": 5.73, + "learning_rate": 2.3724992955762188e-05, + "loss": 0.637, + "step": 6778, + "task_loss": 0.22938290238380432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5932426452636719, + "epoch": 5.73, + "learning_rate": 2.3720296797219878e-05, + "loss": 0.7512, + "step": 6779, + "task_loss": 1.210940957069397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5168956518173218, + "epoch": 5.73, + "learning_rate": 2.3715600638677564e-05, + "loss": 0.7626, + "step": 6780, + "task_loss": 0.5314469933509827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8436493873596191, + "epoch": 5.73, + "learning_rate": 2.371090448013525e-05, + "loss": 0.9492, + "step": 6781, + "task_loss": 0.9849116206169128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7337509989738464, + "epoch": 5.73, + "learning_rate": 2.3706208321592937e-05, + "loss": 0.935, + "step": 6782, + "task_loss": 0.5635011196136475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8422363996505737, + "epoch": 5.73, + "learning_rate": 2.3701512163050626e-05, + "loss": 0.8883, + "step": 6783, + "task_loss": 0.5890029072761536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8110880255699158, + "epoch": 5.73, + "learning_rate": 2.3696816004508313e-05, + "loss": 0.8629, + "step": 6784, + "task_loss": 0.8420389890670776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9593644142150879, + "epoch": 5.73, + "learning_rate": 2.3692119845966003e-05, + "loss": 0.8809, + "step": 6785, + "task_loss": 0.5771307945251465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7364098429679871, + "epoch": 5.74, + "learning_rate": 2.368742368742369e-05, + "loss": 0.7436, + "step": 6786, + "task_loss": 1.1267242431640625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5847415924072266, + "epoch": 5.74, + "learning_rate": 2.3682727528881375e-05, + "loss": 0.9291, + "step": 6787, + "task_loss": 0.6240774989128113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9620779752731323, + "epoch": 5.74, + "learning_rate": 2.367803137033906e-05, + "loss": 0.885, + "step": 6788, + "task_loss": 1.0169920921325684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5182778835296631, + "epoch": 5.74, + "learning_rate": 2.367333521179675e-05, + "loss": 0.7864, + "step": 6789, + "task_loss": 0.4089384973049164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.773504912853241, + "epoch": 5.74, + "learning_rate": 2.3668639053254438e-05, + "loss": 0.7423, + "step": 6790, + "task_loss": 0.7762398719787598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.699677050113678, + "epoch": 5.74, + "learning_rate": 2.3663942894712127e-05, + "loss": 0.9109, + "step": 6791, + "task_loss": 1.1919825077056885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0993223190307617, + "epoch": 5.74, + "learning_rate": 2.3659246736169817e-05, + "loss": 1.2199, + "step": 6792, + "task_loss": 1.123986005783081 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4528881311416626, + "epoch": 5.74, + "learning_rate": 2.36545505776275e-05, + "loss": 0.897, + "step": 6793, + "task_loss": 2.1795127391815186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1702656745910645, + "epoch": 5.74, + "learning_rate": 2.364985441908519e-05, + "loss": 0.9933, + "step": 6794, + "task_loss": 0.4359816908836365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.756558895111084, + "epoch": 5.74, + "learning_rate": 2.3645158260542876e-05, + "loss": 0.8728, + "step": 6795, + "task_loss": 1.057492971420288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0622364282608032, + "epoch": 5.74, + "learning_rate": 2.3640462102000566e-05, + "loss": 0.9923, + "step": 6796, + "task_loss": 0.888116180896759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0495953559875488, + "epoch": 5.75, + "learning_rate": 2.3635765943458252e-05, + "loss": 0.9764, + "step": 6797, + "task_loss": 0.8294357061386108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9675319790840149, + "epoch": 5.75, + "learning_rate": 2.3631069784915942e-05, + "loss": 0.8055, + "step": 6798, + "task_loss": 0.4885249435901642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3587136268615723, + "epoch": 5.75, + "learning_rate": 2.3626373626373628e-05, + "loss": 1.1005, + "step": 6799, + "task_loss": 0.7753101587295532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37446045875549316, + "epoch": 5.75, + "learning_rate": 2.3621677467831315e-05, + "loss": 0.6066, + "step": 6800, + "task_loss": 0.49950113892555237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6154265403747559, + "epoch": 5.75, + "learning_rate": 2.3616981309289e-05, + "loss": 1.1312, + "step": 6801, + "task_loss": 1.5039540529251099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6585464477539062, + "epoch": 5.75, + "learning_rate": 2.361228515074669e-05, + "loss": 0.9463, + "step": 6802, + "task_loss": 1.507667899131775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2881836891174316, + "epoch": 5.75, + "learning_rate": 2.3607588992204377e-05, + "loss": 0.9524, + "step": 6803, + "task_loss": 1.1745299100875854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.6048994064331055, + "epoch": 5.75, + "learning_rate": 2.3602892833662067e-05, + "loss": 0.957, + "step": 6804, + "task_loss": 0.684712827205658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7645369172096252, + "epoch": 5.75, + "learning_rate": 2.3598196675119753e-05, + "loss": 1.0451, + "step": 6805, + "task_loss": 0.9328283071517944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8263797163963318, + "epoch": 5.75, + "learning_rate": 2.359350051657744e-05, + "loss": 0.9066, + "step": 6806, + "task_loss": 1.1664538383483887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5030562877655029, + "epoch": 5.75, + "learning_rate": 2.358880435803513e-05, + "loss": 0.7478, + "step": 6807, + "task_loss": 0.46425676345825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8725855350494385, + "epoch": 5.75, + "learning_rate": 2.3584108199492815e-05, + "loss": 1.2244, + "step": 6808, + "task_loss": 0.9999521374702454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6280463933944702, + "epoch": 5.76, + "learning_rate": 2.3579412040950505e-05, + "loss": 0.7202, + "step": 6809, + "task_loss": 1.6380839347839355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1800111532211304, + "epoch": 5.76, + "learning_rate": 2.357471588240819e-05, + "loss": 0.829, + "step": 6810, + "task_loss": 0.26093339920043945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6319969892501831, + "epoch": 5.76, + "learning_rate": 2.357001972386588e-05, + "loss": 0.781, + "step": 6811, + "task_loss": 1.0988578796386719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6074014902114868, + "epoch": 5.76, + "learning_rate": 2.3565323565323568e-05, + "loss": 0.7322, + "step": 6812, + "task_loss": 1.25213623046875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5137218832969666, + "epoch": 5.76, + "learning_rate": 2.3560627406781254e-05, + "loss": 0.8427, + "step": 6813, + "task_loss": 0.7499380707740784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5483342409133911, + "epoch": 5.76, + "learning_rate": 2.355593124823894e-05, + "loss": 0.7722, + "step": 6814, + "task_loss": 0.4077723026275635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9410679340362549, + "epoch": 5.76, + "learning_rate": 2.355123508969663e-05, + "loss": 0.7725, + "step": 6815, + "task_loss": 0.9299442172050476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6100887060165405, + "epoch": 5.76, + "learning_rate": 2.3546538931154316e-05, + "loss": 0.7016, + "step": 6816, + "task_loss": 0.9085521101951599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.216246485710144, + "epoch": 5.76, + "learning_rate": 2.3541842772612006e-05, + "loss": 0.9952, + "step": 6817, + "task_loss": 1.888864278793335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.212327480316162, + "epoch": 5.76, + "learning_rate": 2.3537146614069692e-05, + "loss": 0.9393, + "step": 6818, + "task_loss": 1.422967791557312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9767873883247375, + "epoch": 5.76, + "learning_rate": 2.353245045552738e-05, + "loss": 0.9512, + "step": 6819, + "task_loss": 1.1837917566299438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3333511352539062, + "epoch": 5.76, + "learning_rate": 2.3527754296985065e-05, + "loss": 1.0178, + "step": 6820, + "task_loss": 1.4801969528198242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3590152859687805, + "epoch": 5.77, + "learning_rate": 2.3523058138442755e-05, + "loss": 0.7483, + "step": 6821, + "task_loss": 0.2580345571041107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4334285855293274, + "epoch": 5.77, + "learning_rate": 2.351836197990044e-05, + "loss": 0.9249, + "step": 6822, + "task_loss": 0.2827914357185364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8378063440322876, + "epoch": 5.77, + "learning_rate": 2.351366582135813e-05, + "loss": 1.2055, + "step": 6823, + "task_loss": 0.8619141578674316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9888297319412231, + "epoch": 5.77, + "learning_rate": 2.350896966281582e-05, + "loss": 0.8761, + "step": 6824, + "task_loss": 0.5391133427619934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0114572048187256, + "epoch": 5.77, + "learning_rate": 2.3504273504273504e-05, + "loss": 0.8212, + "step": 6825, + "task_loss": 1.0523393154144287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9178014397621155, + "epoch": 5.77, + "learning_rate": 2.3499577345731193e-05, + "loss": 0.9809, + "step": 6826, + "task_loss": 1.1906602382659912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1491262912750244, + "epoch": 5.77, + "learning_rate": 2.349488118718888e-05, + "loss": 0.9491, + "step": 6827, + "task_loss": 1.092125654220581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0226373672485352, + "epoch": 5.77, + "learning_rate": 2.349018502864657e-05, + "loss": 0.9059, + "step": 6828, + "task_loss": 0.7144634127616882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7359093427658081, + "epoch": 5.77, + "learning_rate": 2.3485488870104256e-05, + "loss": 0.9656, + "step": 6829, + "task_loss": 0.8184313774108887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3813645839691162, + "epoch": 5.77, + "learning_rate": 2.3480792711561945e-05, + "loss": 1.061, + "step": 6830, + "task_loss": 0.4989403486251831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7664267420768738, + "epoch": 5.77, + "learning_rate": 2.3476096553019632e-05, + "loss": 0.7546, + "step": 6831, + "task_loss": 0.4185173213481903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3168002367019653, + "epoch": 5.77, + "learning_rate": 2.3471400394477318e-05, + "loss": 0.8221, + "step": 6832, + "task_loss": 1.364028811454773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6354924440383911, + "epoch": 5.78, + "learning_rate": 2.3466704235935004e-05, + "loss": 1.1353, + "step": 6833, + "task_loss": 1.573555827140808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4167433977127075, + "epoch": 5.78, + "learning_rate": 2.3462008077392694e-05, + "loss": 1.0049, + "step": 6834, + "task_loss": 1.2953133583068848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8627262115478516, + "epoch": 5.78, + "learning_rate": 2.345731191885038e-05, + "loss": 0.9682, + "step": 6835, + "task_loss": 1.256417989730835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6282433271408081, + "epoch": 5.78, + "learning_rate": 2.345261576030807e-05, + "loss": 0.9248, + "step": 6836, + "task_loss": 0.7942405939102173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9341217279434204, + "epoch": 5.78, + "learning_rate": 2.3447919601765757e-05, + "loss": 0.8284, + "step": 6837, + "task_loss": 1.022289514541626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6974766254425049, + "epoch": 5.78, + "learning_rate": 2.3443223443223443e-05, + "loss": 0.9916, + "step": 6838, + "task_loss": 0.4251866638660431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.794503927230835, + "epoch": 5.78, + "learning_rate": 2.3438527284681133e-05, + "loss": 0.6981, + "step": 6839, + "task_loss": 1.6146925687789917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5364608764648438, + "epoch": 5.78, + "learning_rate": 2.343383112613882e-05, + "loss": 1.3108, + "step": 6840, + "task_loss": 1.0463383197784424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.790054202079773, + "epoch": 5.78, + "learning_rate": 2.342913496759651e-05, + "loss": 0.7953, + "step": 6841, + "task_loss": 0.5601029396057129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45132768154144287, + "epoch": 5.78, + "learning_rate": 2.3424438809054195e-05, + "loss": 0.6887, + "step": 6842, + "task_loss": 1.015944242477417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4502911567687988, + "epoch": 5.78, + "learning_rate": 2.3419742650511885e-05, + "loss": 1.1734, + "step": 6843, + "task_loss": 0.7939892411231995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.400557518005371, + "epoch": 5.78, + "learning_rate": 2.341504649196957e-05, + "loss": 0.9336, + "step": 6844, + "task_loss": 1.5174509286880493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7225519418716431, + "epoch": 5.79, + "learning_rate": 2.3410350333427257e-05, + "loss": 0.9157, + "step": 6845, + "task_loss": 0.35504353046417236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9297246336936951, + "epoch": 5.79, + "learning_rate": 2.3405654174884944e-05, + "loss": 0.8967, + "step": 6846, + "task_loss": 0.3101593255996704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46276307106018066, + "epoch": 5.79, + "learning_rate": 2.3400958016342634e-05, + "loss": 0.8509, + "step": 6847, + "task_loss": 0.7028865218162537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5728351473808289, + "epoch": 5.79, + "learning_rate": 2.339626185780032e-05, + "loss": 0.7822, + "step": 6848, + "task_loss": 0.909103274345398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7553730010986328, + "epoch": 5.79, + "learning_rate": 2.339156569925801e-05, + "loss": 0.6899, + "step": 6849, + "task_loss": 1.172672152519226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.802759051322937, + "epoch": 5.79, + "learning_rate": 2.3386869540715696e-05, + "loss": 0.793, + "step": 6850, + "task_loss": 0.621513843536377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7995235323905945, + "epoch": 5.79, + "learning_rate": 2.3382173382173382e-05, + "loss": 0.8495, + "step": 6851, + "task_loss": 1.2424572706222534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8351410627365112, + "epoch": 5.79, + "learning_rate": 2.337747722363107e-05, + "loss": 1.0225, + "step": 6852, + "task_loss": 0.7646808624267578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8923661708831787, + "epoch": 5.79, + "learning_rate": 2.337278106508876e-05, + "loss": 1.0041, + "step": 6853, + "task_loss": 1.52045476436615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8104562759399414, + "epoch": 5.79, + "learning_rate": 2.3368084906546448e-05, + "loss": 0.9341, + "step": 6854, + "task_loss": 0.9462757706642151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7566956281661987, + "epoch": 5.79, + "learning_rate": 2.3363388748004134e-05, + "loss": 0.6267, + "step": 6855, + "task_loss": 0.8107412457466125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8587989807128906, + "epoch": 5.79, + "learning_rate": 2.3358692589461824e-05, + "loss": 0.8467, + "step": 6856, + "task_loss": 0.7658548951148987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6538493037223816, + "epoch": 5.8, + "learning_rate": 2.3353996430919507e-05, + "loss": 0.758, + "step": 6857, + "task_loss": 1.0989794731140137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7427317500114441, + "epoch": 5.8, + "learning_rate": 2.3349300272377197e-05, + "loss": 0.8188, + "step": 6858, + "task_loss": 0.7333974242210388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.442085325717926, + "epoch": 5.8, + "learning_rate": 2.3344604113834883e-05, + "loss": 0.6523, + "step": 6859, + "task_loss": 0.8253942131996155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8534146547317505, + "epoch": 5.8, + "learning_rate": 2.3339907955292573e-05, + "loss": 0.7179, + "step": 6860, + "task_loss": 1.7735497951507568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.843344509601593, + "epoch": 5.8, + "learning_rate": 2.333521179675026e-05, + "loss": 0.7314, + "step": 6861, + "task_loss": 0.6097232699394226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7665247321128845, + "epoch": 5.8, + "learning_rate": 2.333051563820795e-05, + "loss": 0.8157, + "step": 6862, + "task_loss": 1.3952053785324097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6893059611320496, + "epoch": 5.8, + "learning_rate": 2.3325819479665635e-05, + "loss": 0.7709, + "step": 6863, + "task_loss": 0.6930266618728638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8679988384246826, + "epoch": 5.8, + "learning_rate": 2.332112332112332e-05, + "loss": 0.7166, + "step": 6864, + "task_loss": 1.130811095237732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8586546182632446, + "epoch": 5.8, + "learning_rate": 2.3316427162581008e-05, + "loss": 0.927, + "step": 6865, + "task_loss": 1.284659504890442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6050385236740112, + "epoch": 5.8, + "learning_rate": 2.3311731004038698e-05, + "loss": 0.8173, + "step": 6866, + "task_loss": 0.7318169474601746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5167334675788879, + "epoch": 5.8, + "learning_rate": 2.3307034845496384e-05, + "loss": 0.8118, + "step": 6867, + "task_loss": 0.256539523601532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5412167310714722, + "epoch": 5.81, + "learning_rate": 2.3302338686954074e-05, + "loss": 0.7207, + "step": 6868, + "task_loss": 0.5017980933189392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7587090730667114, + "epoch": 5.81, + "learning_rate": 2.329764252841176e-05, + "loss": 0.8699, + "step": 6869, + "task_loss": 1.1228703260421753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4404726028442383, + "epoch": 5.81, + "learning_rate": 2.3292946369869446e-05, + "loss": 0.8959, + "step": 6870, + "task_loss": 1.045318365097046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.686195969581604, + "epoch": 5.81, + "learning_rate": 2.3288250211327136e-05, + "loss": 0.8004, + "step": 6871, + "task_loss": 1.61308753490448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8011695146560669, + "epoch": 5.81, + "learning_rate": 2.3283554052784823e-05, + "loss": 0.7128, + "step": 6872, + "task_loss": 1.1039596796035767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2391188144683838, + "epoch": 5.81, + "learning_rate": 2.3278857894242512e-05, + "loss": 0.9571, + "step": 6873, + "task_loss": 0.7356581687927246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6594183444976807, + "epoch": 5.81, + "learning_rate": 2.32741617357002e-05, + "loss": 0.9206, + "step": 6874, + "task_loss": 0.14400354027748108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7619233727455139, + "epoch": 5.81, + "learning_rate": 2.326946557715789e-05, + "loss": 0.9263, + "step": 6875, + "task_loss": 0.6382659077644348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1585818529129028, + "epoch": 5.81, + "learning_rate": 2.326476941861557e-05, + "loss": 0.8938, + "step": 6876, + "task_loss": 0.783816933631897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6864654421806335, + "epoch": 5.81, + "learning_rate": 2.326007326007326e-05, + "loss": 0.9365, + "step": 6877, + "task_loss": 0.9610595107078552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6200267672538757, + "epoch": 5.81, + "learning_rate": 2.3255377101530947e-05, + "loss": 0.8179, + "step": 6878, + "task_loss": 0.4812156558036804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1050633192062378, + "epoch": 5.81, + "learning_rate": 2.3250680942988637e-05, + "loss": 1.4977, + "step": 6879, + "task_loss": 1.0830085277557373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8092394471168518, + "epoch": 5.82, + "learning_rate": 2.3245984784446323e-05, + "loss": 0.9317, + "step": 6880, + "task_loss": 0.405548095703125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8841133713722229, + "epoch": 5.82, + "learning_rate": 2.3241288625904013e-05, + "loss": 0.9087, + "step": 6881, + "task_loss": 1.1318182945251465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.007925033569336, + "epoch": 5.82, + "learning_rate": 2.32365924673617e-05, + "loss": 0.9969, + "step": 6882, + "task_loss": 1.354547381401062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7686929106712341, + "epoch": 5.82, + "learning_rate": 2.3231896308819386e-05, + "loss": 0.7524, + "step": 6883, + "task_loss": 0.5223163366317749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9769868850708008, + "epoch": 5.82, + "learning_rate": 2.3227200150277076e-05, + "loss": 0.6363, + "step": 6884, + "task_loss": 1.4073272943496704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8222090005874634, + "epoch": 5.82, + "learning_rate": 2.3222503991734762e-05, + "loss": 0.7308, + "step": 6885, + "task_loss": 0.21077287197113037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6233669519424438, + "epoch": 5.82, + "learning_rate": 2.321780783319245e-05, + "loss": 0.7602, + "step": 6886, + "task_loss": 0.2762865722179413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3236417770385742, + "epoch": 5.82, + "learning_rate": 2.3213111674650138e-05, + "loss": 0.9728, + "step": 6887, + "task_loss": 0.9814698696136475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5462210178375244, + "epoch": 5.82, + "learning_rate": 2.3208415516107824e-05, + "loss": 0.5563, + "step": 6888, + "task_loss": 0.37993213534355164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0004702806472778, + "epoch": 5.82, + "learning_rate": 2.320371935756551e-05, + "loss": 0.8634, + "step": 6889, + "task_loss": 0.5098750591278076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1138341426849365, + "epoch": 5.82, + "learning_rate": 2.31990231990232e-05, + "loss": 0.8875, + "step": 6890, + "task_loss": 1.359745979309082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6399946212768555, + "epoch": 5.82, + "learning_rate": 2.3194327040480887e-05, + "loss": 0.8642, + "step": 6891, + "task_loss": 0.4864305853843689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8141530156135559, + "epoch": 5.83, + "learning_rate": 2.3189630881938576e-05, + "loss": 0.7493, + "step": 6892, + "task_loss": 0.2087172418832779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0358107089996338, + "epoch": 5.83, + "learning_rate": 2.3184934723396263e-05, + "loss": 0.9869, + "step": 6893, + "task_loss": 0.7740412354469299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4288710951805115, + "epoch": 5.83, + "learning_rate": 2.3180238564853952e-05, + "loss": 0.7663, + "step": 6894, + "task_loss": 1.085727334022522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5555062294006348, + "epoch": 5.83, + "learning_rate": 2.317554240631164e-05, + "loss": 0.6862, + "step": 6895, + "task_loss": 0.3498213291168213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8729539513587952, + "epoch": 5.83, + "learning_rate": 2.3170846247769325e-05, + "loss": 1.1404, + "step": 6896, + "task_loss": 1.4903252124786377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.754063606262207, + "epoch": 5.83, + "learning_rate": 2.316615008922701e-05, + "loss": 0.7722, + "step": 6897, + "task_loss": 1.448262095451355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3116471767425537, + "epoch": 5.83, + "learning_rate": 2.31614539306847e-05, + "loss": 0.978, + "step": 6898, + "task_loss": 0.3612949550151825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36440032720565796, + "epoch": 5.83, + "learning_rate": 2.3156757772142388e-05, + "loss": 0.6397, + "step": 6899, + "task_loss": 0.4426412880420685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3543827533721924, + "epoch": 5.83, + "learning_rate": 2.3152061613600077e-05, + "loss": 0.846, + "step": 6900, + "task_loss": 1.5030003786087036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7986416220664978, + "epoch": 5.83, + "learning_rate": 2.3147365455057764e-05, + "loss": 0.7467, + "step": 6901, + "task_loss": 1.2942898273468018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0500165224075317, + "epoch": 5.83, + "learning_rate": 2.314266929651545e-05, + "loss": 0.979, + "step": 6902, + "task_loss": 0.6424713730812073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5613407492637634, + "epoch": 5.83, + "learning_rate": 2.313797313797314e-05, + "loss": 0.8224, + "step": 6903, + "task_loss": 0.665805995464325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5366293787956238, + "epoch": 5.84, + "learning_rate": 2.3133276979430826e-05, + "loss": 0.8141, + "step": 6904, + "task_loss": 1.5167591571807861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5052111148834229, + "epoch": 5.84, + "learning_rate": 2.3128580820888516e-05, + "loss": 0.6873, + "step": 6905, + "task_loss": 1.0663334131240845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7718274593353271, + "epoch": 5.84, + "learning_rate": 2.3123884662346202e-05, + "loss": 0.6698, + "step": 6906, + "task_loss": 0.8493602871894836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1375393867492676, + "epoch": 5.84, + "learning_rate": 2.3119188503803892e-05, + "loss": 1.0143, + "step": 6907, + "task_loss": 0.6042356491088867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8886361122131348, + "epoch": 5.84, + "learning_rate": 2.3114492345261575e-05, + "loss": 0.875, + "step": 6908, + "task_loss": 1.5783445835113525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7825520038604736, + "epoch": 5.84, + "learning_rate": 2.3109796186719265e-05, + "loss": 0.8119, + "step": 6909, + "task_loss": 0.7991464138031006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9669448137283325, + "epoch": 5.84, + "learning_rate": 2.310510002817695e-05, + "loss": 0.8918, + "step": 6910, + "task_loss": 1.1243325471878052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9332430958747864, + "epoch": 5.84, + "learning_rate": 2.310040386963464e-05, + "loss": 0.7433, + "step": 6911, + "task_loss": 0.773442804813385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7388519644737244, + "epoch": 5.84, + "learning_rate": 2.3095707711092327e-05, + "loss": 0.7363, + "step": 6912, + "task_loss": 0.5961018800735474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9556235074996948, + "epoch": 5.84, + "learning_rate": 2.3091011552550017e-05, + "loss": 0.7352, + "step": 6913, + "task_loss": 1.1430150270462036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0854153633117676, + "epoch": 5.84, + "learning_rate": 2.3086315394007703e-05, + "loss": 0.859, + "step": 6914, + "task_loss": 1.4004640579223633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6225603818893433, + "epoch": 5.84, + "learning_rate": 2.308161923546539e-05, + "loss": 0.7295, + "step": 6915, + "task_loss": 0.7262918949127197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9057717323303223, + "epoch": 5.85, + "learning_rate": 2.307692307692308e-05, + "loss": 0.8642, + "step": 6916, + "task_loss": 0.670978844165802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.881833016872406, + "epoch": 5.85, + "learning_rate": 2.3072226918380765e-05, + "loss": 0.7499, + "step": 6917, + "task_loss": 0.6133262515068054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8066921830177307, + "epoch": 5.85, + "learning_rate": 2.3067530759838455e-05, + "loss": 0.8389, + "step": 6918, + "task_loss": 1.5226454734802246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0238455533981323, + "epoch": 5.85, + "learning_rate": 2.306283460129614e-05, + "loss": 0.8864, + "step": 6919, + "task_loss": 0.8819732069969177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1692633628845215, + "epoch": 5.85, + "learning_rate": 2.3058138442753828e-05, + "loss": 0.9533, + "step": 6920, + "task_loss": 1.7323793172836304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8660577535629272, + "epoch": 5.85, + "learning_rate": 2.3053442284211514e-05, + "loss": 0.7999, + "step": 6921, + "task_loss": 0.6514772772789001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6388611793518066, + "epoch": 5.85, + "learning_rate": 2.3048746125669204e-05, + "loss": 0.7827, + "step": 6922, + "task_loss": 0.610383927822113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9167336225509644, + "epoch": 5.85, + "learning_rate": 2.304404996712689e-05, + "loss": 0.7704, + "step": 6923, + "task_loss": 0.6501918435096741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5288246870040894, + "epoch": 5.85, + "learning_rate": 2.303935380858458e-05, + "loss": 0.8486, + "step": 6924, + "task_loss": 0.47539663314819336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5187451839447021, + "epoch": 5.85, + "learning_rate": 2.3034657650042266e-05, + "loss": 0.6627, + "step": 6925, + "task_loss": 0.30219000577926636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6163839101791382, + "epoch": 5.85, + "learning_rate": 2.3029961491499956e-05, + "loss": 0.9476, + "step": 6926, + "task_loss": 0.6096378564834595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0600165128707886, + "epoch": 5.85, + "learning_rate": 2.302526533295764e-05, + "loss": 0.9849, + "step": 6927, + "task_loss": 0.7635249495506287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0966877937316895, + "epoch": 5.86, + "learning_rate": 2.302056917441533e-05, + "loss": 0.9959, + "step": 6928, + "task_loss": 0.8137834072113037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8586665391921997, + "epoch": 5.86, + "learning_rate": 2.3015873015873015e-05, + "loss": 1.0013, + "step": 6929, + "task_loss": 0.980431079864502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5253503918647766, + "epoch": 5.86, + "learning_rate": 2.3011176857330705e-05, + "loss": 0.9951, + "step": 6930, + "task_loss": 1.20150887966156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6390697956085205, + "epoch": 5.86, + "learning_rate": 2.3006480698788394e-05, + "loss": 0.7622, + "step": 6931, + "task_loss": 1.7462464570999146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3124889135360718, + "epoch": 5.86, + "learning_rate": 2.300178454024608e-05, + "loss": 0.984, + "step": 6932, + "task_loss": 1.8247573375701904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3460902273654938, + "epoch": 5.86, + "learning_rate": 2.2997088381703767e-05, + "loss": 0.5183, + "step": 6933, + "task_loss": 0.6855204701423645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8788442015647888, + "epoch": 5.86, + "learning_rate": 2.2992392223161454e-05, + "loss": 0.7912, + "step": 6934, + "task_loss": 1.0054244995117188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8220540881156921, + "epoch": 5.86, + "learning_rate": 2.2987696064619143e-05, + "loss": 0.6779, + "step": 6935, + "task_loss": 1.8817849159240723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4200230836868286, + "epoch": 5.86, + "learning_rate": 2.298299990607683e-05, + "loss": 1.1267, + "step": 6936, + "task_loss": 0.8536819815635681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7493687868118286, + "epoch": 5.86, + "learning_rate": 2.297830374753452e-05, + "loss": 0.8365, + "step": 6937, + "task_loss": 0.6214563250541687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7632887363433838, + "epoch": 5.86, + "learning_rate": 2.2973607588992206e-05, + "loss": 0.8551, + "step": 6938, + "task_loss": 0.12873922288417816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9851712584495544, + "epoch": 5.87, + "learning_rate": 2.2968911430449895e-05, + "loss": 0.94, + "step": 6939, + "task_loss": 0.7460455298423767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7070729732513428, + "epoch": 5.87, + "learning_rate": 2.296421527190758e-05, + "loss": 0.7423, + "step": 6940, + "task_loss": 0.782521665096283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3316031694412231, + "epoch": 5.87, + "learning_rate": 2.2959519113365268e-05, + "loss": 1.0013, + "step": 6941, + "task_loss": 0.610973596572876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8394127488136292, + "epoch": 5.87, + "learning_rate": 2.2954822954822954e-05, + "loss": 0.7113, + "step": 6942, + "task_loss": 0.39955854415893555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8137984275817871, + "epoch": 5.87, + "learning_rate": 2.2950126796280644e-05, + "loss": 0.9763, + "step": 6943, + "task_loss": 1.2447491884231567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8746053576469421, + "epoch": 5.87, + "learning_rate": 2.294543063773833e-05, + "loss": 0.8439, + "step": 6944, + "task_loss": 0.381888747215271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7498089671134949, + "epoch": 5.87, + "learning_rate": 2.294073447919602e-05, + "loss": 0.8196, + "step": 6945, + "task_loss": 1.0139275789260864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6087511777877808, + "epoch": 5.87, + "learning_rate": 2.2936038320653707e-05, + "loss": 0.7155, + "step": 6946, + "task_loss": 0.14695659279823303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7072761058807373, + "epoch": 5.87, + "learning_rate": 2.2931342162111393e-05, + "loss": 0.9015, + "step": 6947, + "task_loss": 0.6744276285171509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.137140154838562, + "epoch": 5.87, + "learning_rate": 2.2926646003569083e-05, + "loss": 0.8566, + "step": 6948, + "task_loss": 1.8221889734268188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45189398527145386, + "epoch": 5.87, + "learning_rate": 2.292194984502677e-05, + "loss": 0.6702, + "step": 6949, + "task_loss": 0.5211673974990845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6708605289459229, + "epoch": 5.87, + "learning_rate": 2.291725368648446e-05, + "loss": 0.9045, + "step": 6950, + "task_loss": 0.5179941654205322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9778361320495605, + "epoch": 5.88, + "learning_rate": 2.2912557527942145e-05, + "loss": 1.005, + "step": 6951, + "task_loss": 1.316859245300293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8110535144805908, + "epoch": 5.88, + "learning_rate": 2.290786136939983e-05, + "loss": 0.8943, + "step": 6952, + "task_loss": 0.9041056632995605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8837547898292542, + "epoch": 5.88, + "learning_rate": 2.2903165210857518e-05, + "loss": 0.7391, + "step": 6953, + "task_loss": 1.5919286012649536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7427752017974854, + "epoch": 5.88, + "learning_rate": 2.2898469052315207e-05, + "loss": 0.7419, + "step": 6954, + "task_loss": 0.2517172396183014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9190841913223267, + "epoch": 5.88, + "learning_rate": 2.2893772893772894e-05, + "loss": 0.6895, + "step": 6955, + "task_loss": 0.6325318217277527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8149527311325073, + "epoch": 5.88, + "learning_rate": 2.2889076735230583e-05, + "loss": 1.0495, + "step": 6956, + "task_loss": 1.2683438062667847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2799642086029053, + "epoch": 5.88, + "learning_rate": 2.288438057668827e-05, + "loss": 1.1165, + "step": 6957, + "task_loss": 1.1726783514022827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.586638867855072, + "epoch": 5.88, + "learning_rate": 2.287968441814596e-05, + "loss": 0.979, + "step": 6958, + "task_loss": 0.6093842387199402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.122387409210205, + "epoch": 5.88, + "learning_rate": 2.2874988259603642e-05, + "loss": 0.9169, + "step": 6959, + "task_loss": 0.9772908687591553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8854989409446716, + "epoch": 5.88, + "learning_rate": 2.2870292101061332e-05, + "loss": 0.7951, + "step": 6960, + "task_loss": 0.8777256608009338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.874219536781311, + "epoch": 5.88, + "learning_rate": 2.286559594251902e-05, + "loss": 0.8117, + "step": 6961, + "task_loss": 0.7840709090232849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8511516451835632, + "epoch": 5.88, + "learning_rate": 2.2860899783976708e-05, + "loss": 0.8148, + "step": 6962, + "task_loss": 0.33824560046195984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6783667206764221, + "epoch": 5.89, + "learning_rate": 2.2856203625434398e-05, + "loss": 0.778, + "step": 6963, + "task_loss": 1.1324853897094727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5567435622215271, + "epoch": 5.89, + "learning_rate": 2.2851507466892084e-05, + "loss": 0.5621, + "step": 6964, + "task_loss": 0.49745652079582214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8053522109985352, + "epoch": 5.89, + "learning_rate": 2.284681130834977e-05, + "loss": 1.0254, + "step": 6965, + "task_loss": 0.5220735669136047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3973289132118225, + "epoch": 5.89, + "learning_rate": 2.2842115149807457e-05, + "loss": 0.9998, + "step": 6966, + "task_loss": 0.6074799299240112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9571936130523682, + "epoch": 5.89, + "learning_rate": 2.2837418991265147e-05, + "loss": 0.9284, + "step": 6967, + "task_loss": 0.9362512826919556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3906312882900238, + "epoch": 5.89, + "learning_rate": 2.2832722832722833e-05, + "loss": 0.7246, + "step": 6968, + "task_loss": 0.7789096832275391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.057280421257019, + "epoch": 5.89, + "learning_rate": 2.2828026674180523e-05, + "loss": 1.2006, + "step": 6969, + "task_loss": 1.5188665390014648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5446982383728027, + "epoch": 5.89, + "learning_rate": 2.282333051563821e-05, + "loss": 0.7341, + "step": 6970, + "task_loss": 0.2737044394016266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0223560333251953, + "epoch": 5.89, + "learning_rate": 2.2818634357095896e-05, + "loss": 0.8467, + "step": 6971, + "task_loss": 0.8495752215385437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.728813111782074, + "epoch": 5.89, + "learning_rate": 2.2813938198553582e-05, + "loss": 0.8262, + "step": 6972, + "task_loss": 0.8715047836303711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7089194655418396, + "epoch": 5.89, + "learning_rate": 2.280924204001127e-05, + "loss": 0.6547, + "step": 6973, + "task_loss": 1.1652207374572754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0218008756637573, + "epoch": 5.89, + "learning_rate": 2.2804545881468958e-05, + "loss": 0.8641, + "step": 6974, + "task_loss": 1.442299485206604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8833197951316833, + "epoch": 5.9, + "learning_rate": 2.2799849722926648e-05, + "loss": 0.7985, + "step": 6975, + "task_loss": 1.2060737609863281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6676265597343445, + "epoch": 5.9, + "learning_rate": 2.2795153564384334e-05, + "loss": 0.8664, + "step": 6976, + "task_loss": 0.7294397354125977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7121256589889526, + "epoch": 5.9, + "learning_rate": 2.2790457405842024e-05, + "loss": 0.9169, + "step": 6977, + "task_loss": 0.4984844923019409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.73046875, + "epoch": 5.9, + "learning_rate": 2.278576124729971e-05, + "loss": 1.1071, + "step": 6978, + "task_loss": 1.3300596475601196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6954641342163086, + "epoch": 5.9, + "learning_rate": 2.2781065088757396e-05, + "loss": 0.867, + "step": 6979, + "task_loss": 0.41550078988075256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0882480144500732, + "epoch": 5.9, + "learning_rate": 2.2776368930215086e-05, + "loss": 1.027, + "step": 6980, + "task_loss": 1.1319646835327148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9222243428230286, + "epoch": 5.9, + "learning_rate": 2.2771672771672772e-05, + "loss": 0.8067, + "step": 6981, + "task_loss": 1.798500657081604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5047727823257446, + "epoch": 5.9, + "learning_rate": 2.2766976613130462e-05, + "loss": 0.6644, + "step": 6982, + "task_loss": 0.4798748791217804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7138422727584839, + "epoch": 5.9, + "learning_rate": 2.276228045458815e-05, + "loss": 1.0872, + "step": 6983, + "task_loss": 1.0998626947402954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6942945718765259, + "epoch": 5.9, + "learning_rate": 2.2757584296045835e-05, + "loss": 0.8228, + "step": 6984, + "task_loss": 1.0934557914733887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9101066589355469, + "epoch": 5.9, + "learning_rate": 2.275288813750352e-05, + "loss": 0.7848, + "step": 6985, + "task_loss": 0.5921005010604858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5204304456710815, + "epoch": 5.9, + "learning_rate": 2.274819197896121e-05, + "loss": 0.6916, + "step": 6986, + "task_loss": 1.2465870380401611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9243582487106323, + "epoch": 5.91, + "learning_rate": 2.2743495820418897e-05, + "loss": 0.8488, + "step": 6987, + "task_loss": 1.042914867401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7971748113632202, + "epoch": 5.91, + "learning_rate": 2.2738799661876587e-05, + "loss": 0.7783, + "step": 6988, + "task_loss": 0.983962893486023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7548261880874634, + "epoch": 5.91, + "learning_rate": 2.2734103503334273e-05, + "loss": 0.7227, + "step": 6989, + "task_loss": 0.329174667596817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8483241200447083, + "epoch": 5.91, + "learning_rate": 2.2729407344791963e-05, + "loss": 0.9124, + "step": 6990, + "task_loss": 2.1100800037384033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9797222018241882, + "epoch": 5.91, + "learning_rate": 2.2724711186249646e-05, + "loss": 0.8651, + "step": 6991, + "task_loss": 1.416909098625183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7715015411376953, + "epoch": 5.91, + "learning_rate": 2.2720015027707336e-05, + "loss": 0.9823, + "step": 6992, + "task_loss": 1.918108344078064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7222669720649719, + "epoch": 5.91, + "learning_rate": 2.2715318869165025e-05, + "loss": 0.8228, + "step": 6993, + "task_loss": 0.8216475248336792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2220818996429443, + "epoch": 5.91, + "learning_rate": 2.2710622710622712e-05, + "loss": 1.1323, + "step": 6994, + "task_loss": 1.4086518287658691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0232889652252197, + "epoch": 5.91, + "learning_rate": 2.27059265520804e-05, + "loss": 0.8367, + "step": 6995, + "task_loss": 0.6310581564903259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8945904970169067, + "epoch": 5.91, + "learning_rate": 2.2701230393538088e-05, + "loss": 1.0898, + "step": 6996, + "task_loss": 0.9773446321487427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8529220223426819, + "epoch": 5.91, + "learning_rate": 2.2696534234995774e-05, + "loss": 0.8647, + "step": 6997, + "task_loss": 0.6171911358833313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1637630462646484, + "epoch": 5.91, + "learning_rate": 2.269183807645346e-05, + "loss": 0.8855, + "step": 6998, + "task_loss": 1.7974998950958252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1789970397949219, + "epoch": 5.92, + "learning_rate": 2.268714191791115e-05, + "loss": 0.8982, + "step": 6999, + "task_loss": 1.2158973217010498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.683456301689148, + "epoch": 5.92, + "learning_rate": 2.2682445759368837e-05, + "loss": 0.7951, + "step": 7000, + "task_loss": 0.6065704822540283 + }, + { + "epoch": 5.92, + "eval_accuracy": 0.8838415841584158, + "eval_loss": 0.5150585770606995, + "eval_runtime": 224.5706, + "eval_samples_per_second": 112.437, + "eval_steps_per_second": 0.882, + "step": 7000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9588413238525391, + "epoch": 5.92, + "learning_rate": 2.2677749600826526e-05, + "loss": 0.7176, + "step": 7001, + "task_loss": 1.2816747426986694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3691787719726562, + "epoch": 5.92, + "learning_rate": 2.2673053442284213e-05, + "loss": 0.8769, + "step": 7002, + "task_loss": 1.5047285556793213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.800286054611206, + "epoch": 5.92, + "learning_rate": 2.26683572837419e-05, + "loss": 0.7655, + "step": 7003, + "task_loss": 0.6300666928291321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3300762176513672, + "epoch": 5.92, + "learning_rate": 2.2663661125199585e-05, + "loss": 0.9512, + "step": 7004, + "task_loss": 1.9579286575317383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8971841335296631, + "epoch": 5.92, + "learning_rate": 2.2658964966657275e-05, + "loss": 0.9581, + "step": 7005, + "task_loss": 0.2554986774921417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6385400295257568, + "epoch": 5.92, + "learning_rate": 2.265426880811496e-05, + "loss": 0.8054, + "step": 7006, + "task_loss": 0.2977989614009857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6598544120788574, + "epoch": 5.92, + "learning_rate": 2.264957264957265e-05, + "loss": 0.7568, + "step": 7007, + "task_loss": 0.6481936573982239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.788580596446991, + "epoch": 5.92, + "learning_rate": 2.264487649103034e-05, + "loss": 0.7236, + "step": 7008, + "task_loss": 1.1272695064544678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9879619479179382, + "epoch": 5.92, + "learning_rate": 2.2640180332488027e-05, + "loss": 0.8172, + "step": 7009, + "task_loss": 1.2341886758804321 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9914407134056091, + "epoch": 5.93, + "learning_rate": 2.2635484173945714e-05, + "loss": 0.7638, + "step": 7010, + "task_loss": 1.3693681955337524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9357978105545044, + "epoch": 5.93, + "learning_rate": 2.26307880154034e-05, + "loss": 0.6903, + "step": 7011, + "task_loss": 1.2228977680206299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0779452323913574, + "epoch": 5.93, + "learning_rate": 2.262609185686109e-05, + "loss": 0.8405, + "step": 7012, + "task_loss": 0.5646799206733704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4261345863342285, + "epoch": 5.93, + "learning_rate": 2.2621395698318776e-05, + "loss": 0.9865, + "step": 7013, + "task_loss": 1.2686917781829834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8649443984031677, + "epoch": 5.93, + "learning_rate": 2.2616699539776466e-05, + "loss": 0.8531, + "step": 7014, + "task_loss": 0.6419537663459778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9455982446670532, + "epoch": 5.93, + "learning_rate": 2.2612003381234152e-05, + "loss": 1.1514, + "step": 7015, + "task_loss": 1.1854740381240845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8388601541519165, + "epoch": 5.93, + "learning_rate": 2.260730722269184e-05, + "loss": 0.8679, + "step": 7016, + "task_loss": 1.5066797733306885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.03610098361969, + "epoch": 5.93, + "learning_rate": 2.2602611064149525e-05, + "loss": 0.7421, + "step": 7017, + "task_loss": 1.0045922994613647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8432217240333557, + "epoch": 5.93, + "learning_rate": 2.2597914905607214e-05, + "loss": 0.8514, + "step": 7018, + "task_loss": 0.3781205117702484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47558140754699707, + "epoch": 5.93, + "learning_rate": 2.25932187470649e-05, + "loss": 0.6619, + "step": 7019, + "task_loss": 0.15561926364898682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7415368556976318, + "epoch": 5.93, + "learning_rate": 2.258852258852259e-05, + "loss": 0.7535, + "step": 7020, + "task_loss": 0.9931349754333496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9540878534317017, + "epoch": 5.93, + "learning_rate": 2.2583826429980277e-05, + "loss": 1.0486, + "step": 7021, + "task_loss": 1.3202139139175415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7641855478286743, + "epoch": 5.94, + "learning_rate": 2.2579130271437963e-05, + "loss": 0.7483, + "step": 7022, + "task_loss": 0.9984277486801147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.765364408493042, + "epoch": 5.94, + "learning_rate": 2.2574434112895653e-05, + "loss": 0.9977, + "step": 7023, + "task_loss": 1.194465160369873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3132104277610779, + "epoch": 5.94, + "learning_rate": 2.256973795435334e-05, + "loss": 0.5363, + "step": 7024, + "task_loss": 0.4265490174293518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9456419944763184, + "epoch": 5.94, + "learning_rate": 2.256504179581103e-05, + "loss": 0.9498, + "step": 7025, + "task_loss": 0.6996541619300842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.217101812362671, + "epoch": 5.94, + "learning_rate": 2.2560345637268715e-05, + "loss": 0.8973, + "step": 7026, + "task_loss": 0.5073807239532471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.022989273071289, + "epoch": 5.94, + "learning_rate": 2.2555649478726405e-05, + "loss": 0.903, + "step": 7027, + "task_loss": 1.0329394340515137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4081493318080902, + "epoch": 5.94, + "learning_rate": 2.255095332018409e-05, + "loss": 0.7252, + "step": 7028, + "task_loss": 0.6235049366950989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7852829694747925, + "epoch": 5.94, + "learning_rate": 2.2546257161641778e-05, + "loss": 0.7015, + "step": 7029, + "task_loss": 1.5458372831344604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3363259434700012, + "epoch": 5.94, + "learning_rate": 2.2541561003099464e-05, + "loss": 0.6224, + "step": 7030, + "task_loss": 0.22201451659202576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8587237000465393, + "epoch": 5.94, + "learning_rate": 2.2536864844557154e-05, + "loss": 0.7754, + "step": 7031, + "task_loss": 1.8809257745742798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9103831052780151, + "epoch": 5.94, + "learning_rate": 2.253216868601484e-05, + "loss": 0.9604, + "step": 7032, + "task_loss": 1.038590431213379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7720468044281006, + "epoch": 5.94, + "learning_rate": 2.252747252747253e-05, + "loss": 0.8512, + "step": 7033, + "task_loss": 1.3782955408096313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1836183071136475, + "epoch": 5.95, + "learning_rate": 2.2522776368930216e-05, + "loss": 0.7274, + "step": 7034, + "task_loss": 1.0465004444122314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5925596952438354, + "epoch": 5.95, + "learning_rate": 2.2518080210387903e-05, + "loss": 0.7852, + "step": 7035, + "task_loss": 0.697490930557251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5514025688171387, + "epoch": 5.95, + "learning_rate": 2.251338405184559e-05, + "loss": 0.6209, + "step": 7036, + "task_loss": 0.5737578868865967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.017447590827942, + "epoch": 5.95, + "learning_rate": 2.250868789330328e-05, + "loss": 1.0084, + "step": 7037, + "task_loss": 1.4770710468292236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7246628999710083, + "epoch": 5.95, + "learning_rate": 2.2503991734760965e-05, + "loss": 0.6628, + "step": 7038, + "task_loss": 1.2673052549362183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8188577890396118, + "epoch": 5.95, + "learning_rate": 2.2499295576218655e-05, + "loss": 0.8567, + "step": 7039, + "task_loss": 2.572810649871826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5957992076873779, + "epoch": 5.95, + "learning_rate": 2.2494599417676344e-05, + "loss": 0.8747, + "step": 7040, + "task_loss": 0.18199940025806427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.70896977186203, + "epoch": 5.95, + "learning_rate": 2.248990325913403e-05, + "loss": 0.6841, + "step": 7041, + "task_loss": 0.46921682357788086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8619024753570557, + "epoch": 5.95, + "learning_rate": 2.2485207100591717e-05, + "loss": 0.8581, + "step": 7042, + "task_loss": 0.3523638844490051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.750055193901062, + "epoch": 5.95, + "learning_rate": 2.2480510942049403e-05, + "loss": 0.9406, + "step": 7043, + "task_loss": 1.5775014162063599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8647449612617493, + "epoch": 5.95, + "learning_rate": 2.2475814783507093e-05, + "loss": 0.9011, + "step": 7044, + "task_loss": 1.0744491815567017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5677909851074219, + "epoch": 5.95, + "learning_rate": 2.247111862496478e-05, + "loss": 0.7126, + "step": 7045, + "task_loss": 1.059313178062439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6422876715660095, + "epoch": 5.96, + "learning_rate": 2.246642246642247e-05, + "loss": 0.7657, + "step": 7046, + "task_loss": 0.8289316296577454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6676558256149292, + "epoch": 5.96, + "learning_rate": 2.2461726307880156e-05, + "loss": 0.8593, + "step": 7047, + "task_loss": 0.2909344434738159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48784780502319336, + "epoch": 5.96, + "learning_rate": 2.2457030149337842e-05, + "loss": 0.6151, + "step": 7048, + "task_loss": 0.9122986197471619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9367936849594116, + "epoch": 5.96, + "learning_rate": 2.2452333990795528e-05, + "loss": 0.8298, + "step": 7049, + "task_loss": 0.8969089388847351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1982269287109375, + "epoch": 5.96, + "learning_rate": 2.2447637832253218e-05, + "loss": 0.9218, + "step": 7050, + "task_loss": 1.847790002822876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5890570878982544, + "epoch": 5.96, + "learning_rate": 2.2442941673710904e-05, + "loss": 0.5573, + "step": 7051, + "task_loss": 0.5556719303131104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41060635447502136, + "epoch": 5.96, + "learning_rate": 2.2438245515168594e-05, + "loss": 0.8207, + "step": 7052, + "task_loss": 0.3406837582588196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.935823380947113, + "epoch": 5.96, + "learning_rate": 2.243354935662628e-05, + "loss": 0.8276, + "step": 7053, + "task_loss": 0.6874507665634155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.297155499458313, + "epoch": 5.96, + "learning_rate": 2.2428853198083967e-05, + "loss": 1.1128, + "step": 7054, + "task_loss": 1.421630620956421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5736539959907532, + "epoch": 5.96, + "learning_rate": 2.2424157039541656e-05, + "loss": 0.9507, + "step": 7055, + "task_loss": 0.4416114091873169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46343064308166504, + "epoch": 5.96, + "learning_rate": 2.2419460880999343e-05, + "loss": 0.9748, + "step": 7056, + "task_loss": 0.12594757974147797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36130693554878235, + "epoch": 5.96, + "learning_rate": 2.2414764722457033e-05, + "loss": 1.0106, + "step": 7057, + "task_loss": 0.5704684257507324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7424352765083313, + "epoch": 5.97, + "learning_rate": 2.241006856391472e-05, + "loss": 1.0201, + "step": 7058, + "task_loss": 1.4417438507080078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7421086430549622, + "epoch": 5.97, + "learning_rate": 2.240537240537241e-05, + "loss": 0.9251, + "step": 7059, + "task_loss": 0.9259775280952454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5388233661651611, + "epoch": 5.97, + "learning_rate": 2.2400676246830095e-05, + "loss": 0.7348, + "step": 7060, + "task_loss": 1.284247875213623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.365987777709961, + "epoch": 5.97, + "learning_rate": 2.239598008828778e-05, + "loss": 0.9939, + "step": 7061, + "task_loss": 1.0275120735168457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8863250613212585, + "epoch": 5.97, + "learning_rate": 2.2391283929745468e-05, + "loss": 1.0017, + "step": 7062, + "task_loss": 0.7532990574836731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35293763875961304, + "epoch": 5.97, + "learning_rate": 2.2386587771203157e-05, + "loss": 0.7771, + "step": 7063, + "task_loss": 0.5053237676620483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7880846261978149, + "epoch": 5.97, + "learning_rate": 2.2381891612660844e-05, + "loss": 0.6985, + "step": 7064, + "task_loss": 0.7875204086303711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7210752964019775, + "epoch": 5.97, + "learning_rate": 2.2377195454118533e-05, + "loss": 0.8243, + "step": 7065, + "task_loss": 1.0389220714569092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7914570569992065, + "epoch": 5.97, + "learning_rate": 2.237249929557622e-05, + "loss": 0.7726, + "step": 7066, + "task_loss": 0.4207613468170166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7843008041381836, + "epoch": 5.97, + "learning_rate": 2.2367803137033906e-05, + "loss": 0.7116, + "step": 7067, + "task_loss": 1.1446281671524048 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0661332607269287, + "epoch": 5.97, + "learning_rate": 2.2363106978491592e-05, + "loss": 0.7027, + "step": 7068, + "task_loss": 0.8612820506095886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5790665149688721, + "epoch": 5.97, + "learning_rate": 2.2358410819949282e-05, + "loss": 0.7596, + "step": 7069, + "task_loss": 0.9179668426513672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6164804697036743, + "epoch": 5.98, + "learning_rate": 2.2353714661406972e-05, + "loss": 0.9001, + "step": 7070, + "task_loss": 0.6243817806243896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1339552402496338, + "epoch": 5.98, + "learning_rate": 2.2349018502864658e-05, + "loss": 1.1157, + "step": 7071, + "task_loss": 1.8071458339691162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7230014204978943, + "epoch": 5.98, + "learning_rate": 2.2344322344322348e-05, + "loss": 0.8089, + "step": 7072, + "task_loss": 1.2446247339248657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1915570497512817, + "epoch": 5.98, + "learning_rate": 2.2339626185780034e-05, + "loss": 1.0401, + "step": 7073, + "task_loss": 0.9752329587936401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6162565350532532, + "epoch": 5.98, + "learning_rate": 2.233493002723772e-05, + "loss": 0.8273, + "step": 7074, + "task_loss": 0.37752223014831543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6434027552604675, + "epoch": 5.98, + "learning_rate": 2.2330233868695407e-05, + "loss": 0.9642, + "step": 7075, + "task_loss": 0.39203447103500366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6678682565689087, + "epoch": 5.98, + "learning_rate": 2.2325537710153097e-05, + "loss": 0.6932, + "step": 7076, + "task_loss": 0.5934561491012573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.611844539642334, + "epoch": 5.98, + "learning_rate": 2.2320841551610783e-05, + "loss": 0.8508, + "step": 7077, + "task_loss": 0.23391200602054596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6665818095207214, + "epoch": 5.98, + "learning_rate": 2.2316145393068473e-05, + "loss": 0.7439, + "step": 7078, + "task_loss": 0.30145263671875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9503761529922485, + "epoch": 5.98, + "learning_rate": 2.231144923452616e-05, + "loss": 0.8294, + "step": 7079, + "task_loss": 2.1329708099365234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5071483254432678, + "epoch": 5.98, + "learning_rate": 2.2306753075983845e-05, + "loss": 0.7459, + "step": 7080, + "task_loss": 0.8000216484069824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1570560932159424, + "epoch": 5.99, + "learning_rate": 2.2302056917441532e-05, + "loss": 1.1422, + "step": 7081, + "task_loss": 0.6803814172744751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5520585775375366, + "epoch": 5.99, + "learning_rate": 2.229736075889922e-05, + "loss": 0.8093, + "step": 7082, + "task_loss": 0.8136605620384216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8256561756134033, + "epoch": 5.99, + "learning_rate": 2.2292664600356908e-05, + "loss": 1.0303, + "step": 7083, + "task_loss": 0.8956756591796875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8932708501815796, + "epoch": 5.99, + "learning_rate": 2.2287968441814598e-05, + "loss": 1.0735, + "step": 7084, + "task_loss": 0.9514458179473877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5048891305923462, + "epoch": 5.99, + "learning_rate": 2.2283272283272287e-05, + "loss": 0.6773, + "step": 7085, + "task_loss": 1.1843836307525635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7672263979911804, + "epoch": 5.99, + "learning_rate": 2.227857612472997e-05, + "loss": 1.0116, + "step": 7086, + "task_loss": 0.047299567610025406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.845800518989563, + "epoch": 5.99, + "learning_rate": 2.227387996618766e-05, + "loss": 0.8056, + "step": 7087, + "task_loss": 0.9710116386413574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5241060853004456, + "epoch": 5.99, + "learning_rate": 2.2269183807645346e-05, + "loss": 0.6622, + "step": 7088, + "task_loss": 0.5125356912612915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3953444957733154, + "epoch": 5.99, + "learning_rate": 2.2264487649103036e-05, + "loss": 0.8504, + "step": 7089, + "task_loss": 1.9851305484771729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6369837522506714, + "epoch": 5.99, + "learning_rate": 2.2259791490560722e-05, + "loss": 0.6087, + "step": 7090, + "task_loss": 0.2604276239871979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7294655442237854, + "epoch": 5.99, + "learning_rate": 2.2255095332018412e-05, + "loss": 0.7701, + "step": 7091, + "task_loss": 0.835907518863678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6402999758720398, + "epoch": 5.99, + "learning_rate": 2.22503991734761e-05, + "loss": 0.6667, + "step": 7092, + "task_loss": 1.5920175313949585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7890723943710327, + "epoch": 6.0, + "learning_rate": 2.2245703014933785e-05, + "loss": 0.948, + "step": 7093, + "task_loss": 0.46619632840156555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9958089590072632, + "epoch": 6.0, + "learning_rate": 2.224100685639147e-05, + "loss": 0.7747, + "step": 7094, + "task_loss": 1.7218952178955078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7369644045829773, + "epoch": 6.0, + "learning_rate": 2.223631069784916e-05, + "loss": 1.0508, + "step": 7095, + "task_loss": 0.6656375527381897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6014696955680847, + "epoch": 6.0, + "learning_rate": 2.2231614539306847e-05, + "loss": 0.8388, + "step": 7096, + "task_loss": 1.1683316230773926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5023189187049866, + "epoch": 6.0, + "learning_rate": 2.2226918380764537e-05, + "loss": 0.8474, + "step": 7097, + "task_loss": 0.46635836362838745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3413389921188354, + "epoch": 6.0, + "learning_rate": 2.2222222222222223e-05, + "loss": 1.1693, + "step": 7098, + "task_loss": 2.1103053092956543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38380709290504456, + "epoch": 6.0, + "learning_rate": 2.221752606367991e-05, + "loss": 1.0348, + "step": 7099, + "task_loss": 0.45267486572265625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8650040626525879, + "epoch": 6.0, + "learning_rate": 2.22128299051376e-05, + "loss": 0.7859, + "step": 7100, + "task_loss": 0.8456686735153198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7220553159713745, + "epoch": 6.0, + "learning_rate": 2.2208133746595286e-05, + "loss": 0.6328, + "step": 7101, + "task_loss": 0.88136225938797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42263901233673096, + "epoch": 6.0, + "learning_rate": 2.2203437588052975e-05, + "loss": 0.7556, + "step": 7102, + "task_loss": 0.16944269835948944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7597610950469971, + "epoch": 6.0, + "learning_rate": 2.2198741429510662e-05, + "loss": 0.7617, + "step": 7103, + "task_loss": 0.6251004934310913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7827243804931641, + "epoch": 6.01, + "learning_rate": 2.219404527096835e-05, + "loss": 0.9542, + "step": 7104, + "task_loss": 0.7290997505187988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6937300562858582, + "epoch": 6.01, + "learning_rate": 2.2189349112426034e-05, + "loss": 0.753, + "step": 7105, + "task_loss": 0.5165429711341858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5440090298652649, + "epoch": 6.01, + "learning_rate": 2.2184652953883724e-05, + "loss": 0.69, + "step": 7106, + "task_loss": 0.9731672406196594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1508285999298096, + "epoch": 6.01, + "learning_rate": 2.217995679534141e-05, + "loss": 0.8466, + "step": 7107, + "task_loss": 1.384342908859253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1506946086883545, + "epoch": 6.01, + "learning_rate": 2.21752606367991e-05, + "loss": 0.7503, + "step": 7108, + "task_loss": 1.3345965147018433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6869202852249146, + "epoch": 6.01, + "learning_rate": 2.2170564478256787e-05, + "loss": 0.6771, + "step": 7109, + "task_loss": 0.7129122018814087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9743735790252686, + "epoch": 6.01, + "learning_rate": 2.2165868319714476e-05, + "loss": 0.7487, + "step": 7110, + "task_loss": 0.19290515780448914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6354200839996338, + "epoch": 6.01, + "learning_rate": 2.2161172161172163e-05, + "loss": 1.0086, + "step": 7111, + "task_loss": 0.7354514002799988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.529369056224823, + "epoch": 6.01, + "learning_rate": 2.215647600262985e-05, + "loss": 0.8747, + "step": 7112, + "task_loss": 0.7682474851608276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9894292950630188, + "epoch": 6.01, + "learning_rate": 2.2151779844087535e-05, + "loss": 0.877, + "step": 7113, + "task_loss": 1.9297856092453003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5690426826477051, + "epoch": 6.01, + "learning_rate": 2.2147083685545225e-05, + "loss": 0.5735, + "step": 7114, + "task_loss": 0.2082011103630066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9326813220977783, + "epoch": 6.01, + "learning_rate": 2.214238752700291e-05, + "loss": 0.805, + "step": 7115, + "task_loss": 0.8731945753097534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6090649962425232, + "epoch": 6.02, + "learning_rate": 2.21376913684606e-05, + "loss": 0.8732, + "step": 7116, + "task_loss": 0.18333499133586884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7906862497329712, + "epoch": 6.02, + "learning_rate": 2.2132995209918287e-05, + "loss": 0.5838, + "step": 7117, + "task_loss": 0.6922846436500549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5143712759017944, + "epoch": 6.02, + "learning_rate": 2.2128299051375974e-05, + "loss": 0.8429, + "step": 7118, + "task_loss": 0.5707194209098816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0517569780349731, + "epoch": 6.02, + "learning_rate": 2.2123602892833664e-05, + "loss": 0.7796, + "step": 7119, + "task_loss": 1.7226910591125488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5328706502914429, + "epoch": 6.02, + "learning_rate": 2.211890673429135e-05, + "loss": 0.7218, + "step": 7120, + "task_loss": 1.0554251670837402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9534875154495239, + "epoch": 6.02, + "learning_rate": 2.211421057574904e-05, + "loss": 0.8273, + "step": 7121, + "task_loss": 0.7329445481300354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7075216770172119, + "epoch": 6.02, + "learning_rate": 2.2109514417206726e-05, + "loss": 0.7766, + "step": 7122, + "task_loss": 0.44506293535232544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9938985109329224, + "epoch": 6.02, + "learning_rate": 2.2104818258664416e-05, + "loss": 0.8157, + "step": 7123, + "task_loss": 0.37984177470207214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8548861742019653, + "epoch": 6.02, + "learning_rate": 2.2100122100122102e-05, + "loss": 0.8224, + "step": 7124, + "task_loss": 0.6260621547698975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.015764594078064, + "epoch": 6.02, + "learning_rate": 2.209542594157979e-05, + "loss": 0.8965, + "step": 7125, + "task_loss": 1.584453821182251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8498431444168091, + "epoch": 6.02, + "learning_rate": 2.2090729783037475e-05, + "loss": 0.6671, + "step": 7126, + "task_loss": 1.1376436948776245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8616166710853577, + "epoch": 6.02, + "learning_rate": 2.2086033624495164e-05, + "loss": 0.8606, + "step": 7127, + "task_loss": 0.3699338734149933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1592398881912231, + "epoch": 6.03, + "learning_rate": 2.208133746595285e-05, + "loss": 0.9588, + "step": 7128, + "task_loss": 0.8832799196243286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.620702862739563, + "epoch": 6.03, + "learning_rate": 2.207664130741054e-05, + "loss": 0.9765, + "step": 7129, + "task_loss": 0.9461565017700195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4700899124145508, + "epoch": 6.03, + "learning_rate": 2.2071945148868227e-05, + "loss": 0.6986, + "step": 7130, + "task_loss": 0.6697307229042053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2398372888565063, + "epoch": 6.03, + "learning_rate": 2.2067248990325913e-05, + "loss": 1.098, + "step": 7131, + "task_loss": 0.8978198766708374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8548671007156372, + "epoch": 6.03, + "learning_rate": 2.2062552831783603e-05, + "loss": 0.6506, + "step": 7132, + "task_loss": 0.2794896960258484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5915980339050293, + "epoch": 6.03, + "learning_rate": 2.205785667324129e-05, + "loss": 0.9059, + "step": 7133, + "task_loss": 0.43156757950782776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9717321991920471, + "epoch": 6.03, + "learning_rate": 2.205316051469898e-05, + "loss": 0.7042, + "step": 7134, + "task_loss": 0.6891641020774841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.312893271446228, + "epoch": 6.03, + "learning_rate": 2.2048464356156665e-05, + "loss": 0.8802, + "step": 7135, + "task_loss": 1.6514629125595093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6723796129226685, + "epoch": 6.03, + "learning_rate": 2.2043768197614355e-05, + "loss": 0.7898, + "step": 7136, + "task_loss": 0.09357137233018875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.125716209411621, + "epoch": 6.03, + "learning_rate": 2.2039072039072038e-05, + "loss": 0.9075, + "step": 7137, + "task_loss": 0.8880801200866699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8031906485557556, + "epoch": 6.03, + "learning_rate": 2.2034375880529728e-05, + "loss": 0.8184, + "step": 7138, + "task_loss": 1.023810863494873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3047631084918976, + "epoch": 6.03, + "learning_rate": 2.2029679721987414e-05, + "loss": 0.512, + "step": 7139, + "task_loss": 0.7965165972709656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1894936561584473, + "epoch": 6.04, + "learning_rate": 2.2024983563445104e-05, + "loss": 0.822, + "step": 7140, + "task_loss": 0.5055736303329468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7172307968139648, + "epoch": 6.04, + "learning_rate": 2.202028740490279e-05, + "loss": 1.0584, + "step": 7141, + "task_loss": 0.575607419013977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6257330179214478, + "epoch": 6.04, + "learning_rate": 2.201559124636048e-05, + "loss": 0.8242, + "step": 7142, + "task_loss": 1.00485098361969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8556972742080688, + "epoch": 6.04, + "learning_rate": 2.2010895087818166e-05, + "loss": 0.7311, + "step": 7143, + "task_loss": 0.41729289293289185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5250390768051147, + "epoch": 6.04, + "learning_rate": 2.2006198929275853e-05, + "loss": 0.6189, + "step": 7144, + "task_loss": 0.4488837718963623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8957421183586121, + "epoch": 6.04, + "learning_rate": 2.200150277073354e-05, + "loss": 0.8399, + "step": 7145, + "task_loss": 1.7091084718704224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7141124606132507, + "epoch": 6.04, + "learning_rate": 2.199680661219123e-05, + "loss": 0.9786, + "step": 7146, + "task_loss": 0.5973165035247803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4809889793395996, + "epoch": 6.04, + "learning_rate": 2.199211045364892e-05, + "loss": 0.8487, + "step": 7147, + "task_loss": 0.7864792346954346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1007254123687744, + "epoch": 6.04, + "learning_rate": 2.1987414295106605e-05, + "loss": 0.8665, + "step": 7148, + "task_loss": 1.084396481513977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1862244606018066, + "epoch": 6.04, + "learning_rate": 2.198271813656429e-05, + "loss": 0.7627, + "step": 7149, + "task_loss": 0.9052022695541382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8797260522842407, + "epoch": 6.04, + "learning_rate": 2.1978021978021977e-05, + "loss": 1.012, + "step": 7150, + "task_loss": 0.8473811745643616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5255821943283081, + "epoch": 6.04, + "learning_rate": 2.1973325819479667e-05, + "loss": 0.5598, + "step": 7151, + "task_loss": 0.6623478531837463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7809185981750488, + "epoch": 6.05, + "learning_rate": 2.1968629660937353e-05, + "loss": 0.6905, + "step": 7152, + "task_loss": 0.8511772155761719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5125645995140076, + "epoch": 6.05, + "learning_rate": 2.1963933502395043e-05, + "loss": 0.7509, + "step": 7153, + "task_loss": 0.4534519612789154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0601935386657715, + "epoch": 6.05, + "learning_rate": 2.195923734385273e-05, + "loss": 0.896, + "step": 7154, + "task_loss": 0.8400830626487732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9168382883071899, + "epoch": 6.05, + "learning_rate": 2.195454118531042e-05, + "loss": 0.8192, + "step": 7155, + "task_loss": 0.9539080262184143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9142945408821106, + "epoch": 6.05, + "learning_rate": 2.1949845026768106e-05, + "loss": 0.9187, + "step": 7156, + "task_loss": 0.9168631434440613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4489479660987854, + "epoch": 6.05, + "learning_rate": 2.1945148868225792e-05, + "loss": 0.8247, + "step": 7157, + "task_loss": 0.18643520772457123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5891568660736084, + "epoch": 6.05, + "learning_rate": 2.1940452709683478e-05, + "loss": 0.7695, + "step": 7158, + "task_loss": 0.2961452007293701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5328854322433472, + "epoch": 6.05, + "learning_rate": 2.1935756551141168e-05, + "loss": 0.6078, + "step": 7159, + "task_loss": 0.37295442819595337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7369132041931152, + "epoch": 6.05, + "learning_rate": 2.1931060392598854e-05, + "loss": 0.6495, + "step": 7160, + "task_loss": 0.8477520942687988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5799646377563477, + "epoch": 6.05, + "learning_rate": 2.1926364234056544e-05, + "loss": 0.8054, + "step": 7161, + "task_loss": 1.1240569353103638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5039221048355103, + "epoch": 6.05, + "learning_rate": 2.192166807551423e-05, + "loss": 0.6328, + "step": 7162, + "task_loss": 0.596105694770813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9567481875419617, + "epoch": 6.05, + "learning_rate": 2.1916971916971917e-05, + "loss": 0.655, + "step": 7163, + "task_loss": 0.6072888374328613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.606728196144104, + "epoch": 6.06, + "learning_rate": 2.1912275758429606e-05, + "loss": 0.5885, + "step": 7164, + "task_loss": 0.3200775980949402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.734768271446228, + "epoch": 6.06, + "learning_rate": 2.1907579599887293e-05, + "loss": 0.7925, + "step": 7165, + "task_loss": 0.40596508979797363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4207037091255188, + "epoch": 6.06, + "learning_rate": 2.1902883441344982e-05, + "loss": 0.6016, + "step": 7166, + "task_loss": 0.49969935417175293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3173852264881134, + "epoch": 6.06, + "learning_rate": 2.189818728280267e-05, + "loss": 0.5807, + "step": 7167, + "task_loss": 0.3355173170566559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4658110439777374, + "epoch": 6.06, + "learning_rate": 2.189349112426036e-05, + "loss": 0.7398, + "step": 7168, + "task_loss": 0.16004134714603424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7698280811309814, + "epoch": 6.06, + "learning_rate": 2.188879496571804e-05, + "loss": 0.6663, + "step": 7169, + "task_loss": 0.9086529016494751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6550474762916565, + "epoch": 6.06, + "learning_rate": 2.188409880717573e-05, + "loss": 0.5423, + "step": 7170, + "task_loss": 0.6860083937644958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8589059114456177, + "epoch": 6.06, + "learning_rate": 2.1879402648633418e-05, + "loss": 0.6948, + "step": 7171, + "task_loss": 1.3490904569625854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7704837322235107, + "epoch": 6.06, + "learning_rate": 2.1874706490091107e-05, + "loss": 0.8034, + "step": 7172, + "task_loss": 0.2552017867565155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6756435632705688, + "epoch": 6.06, + "learning_rate": 2.1870010331548794e-05, + "loss": 0.906, + "step": 7173, + "task_loss": 1.29674232006073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9244197607040405, + "epoch": 6.06, + "learning_rate": 2.1865314173006483e-05, + "loss": 0.9253, + "step": 7174, + "task_loss": 1.3732184171676636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.874692440032959, + "epoch": 6.07, + "learning_rate": 2.186061801446417e-05, + "loss": 0.6775, + "step": 7175, + "task_loss": 0.6197335720062256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6820969581604004, + "epoch": 6.07, + "learning_rate": 2.1855921855921856e-05, + "loss": 0.7291, + "step": 7176, + "task_loss": 0.25539979338645935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49988046288490295, + "epoch": 6.07, + "learning_rate": 2.1851225697379546e-05, + "loss": 0.6907, + "step": 7177, + "task_loss": 0.15416975319385529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4537976384162903, + "epoch": 6.07, + "learning_rate": 2.1846529538837232e-05, + "loss": 0.7524, + "step": 7178, + "task_loss": 0.7771729230880737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5168384909629822, + "epoch": 6.07, + "learning_rate": 2.1841833380294922e-05, + "loss": 0.8483, + "step": 7179, + "task_loss": 0.8032326698303223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1561343669891357, + "epoch": 6.07, + "learning_rate": 2.1837137221752608e-05, + "loss": 0.7695, + "step": 7180, + "task_loss": 1.2025516033172607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6765623092651367, + "epoch": 6.07, + "learning_rate": 2.1832441063210295e-05, + "loss": 0.8174, + "step": 7181, + "task_loss": 0.9995985627174377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.8926937580108643, + "epoch": 6.07, + "learning_rate": 2.182774490466798e-05, + "loss": 1.0419, + "step": 7182, + "task_loss": 1.49722421169281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47439974546432495, + "epoch": 6.07, + "learning_rate": 2.182304874612567e-05, + "loss": 0.7517, + "step": 7183, + "task_loss": 0.17238035798072815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5642910599708557, + "epoch": 6.07, + "learning_rate": 2.1818352587583357e-05, + "loss": 0.8247, + "step": 7184, + "task_loss": 0.053178928792476654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8397172093391418, + "epoch": 6.07, + "learning_rate": 2.1813656429041047e-05, + "loss": 0.7845, + "step": 7185, + "task_loss": 0.30938243865966797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9758416414260864, + "epoch": 6.07, + "learning_rate": 2.1808960270498733e-05, + "loss": 0.8254, + "step": 7186, + "task_loss": 1.5168362855911255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49982646107673645, + "epoch": 6.08, + "learning_rate": 2.1804264111956423e-05, + "loss": 0.7394, + "step": 7187, + "task_loss": 0.7977765798568726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1147339344024658, + "epoch": 6.08, + "learning_rate": 2.1799567953414106e-05, + "loss": 0.7209, + "step": 7188, + "task_loss": 1.8341929912567139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6717292666435242, + "epoch": 6.08, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.6333, + "step": 7189, + "task_loss": 1.048740029335022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7996200323104858, + "epoch": 6.08, + "learning_rate": 2.1790175636329482e-05, + "loss": 0.7675, + "step": 7190, + "task_loss": 0.9711360335350037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.670913577079773, + "epoch": 6.08, + "learning_rate": 2.178547947778717e-05, + "loss": 0.5661, + "step": 7191, + "task_loss": 0.7828474044799805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0462335348129272, + "epoch": 6.08, + "learning_rate": 2.1780783319244858e-05, + "loss": 0.8979, + "step": 7192, + "task_loss": 1.197798728942871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.75876384973526, + "epoch": 6.08, + "learning_rate": 2.1776087160702548e-05, + "loss": 0.8777, + "step": 7193, + "task_loss": 0.5137035846710205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7453130483627319, + "epoch": 6.08, + "learning_rate": 2.1771391002160234e-05, + "loss": 0.7199, + "step": 7194, + "task_loss": 0.4225083589553833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9166837930679321, + "epoch": 6.08, + "learning_rate": 2.176669484361792e-05, + "loss": 0.6073, + "step": 7195, + "task_loss": 0.5939455032348633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6556491851806641, + "epoch": 6.08, + "learning_rate": 2.176199868507561e-05, + "loss": 0.8144, + "step": 7196, + "task_loss": 0.9048523902893066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.664811372756958, + "epoch": 6.08, + "learning_rate": 2.1757302526533296e-05, + "loss": 0.7984, + "step": 7197, + "task_loss": 0.41085103154182434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4912843704223633, + "epoch": 6.08, + "learning_rate": 2.1752606367990986e-05, + "loss": 0.8464, + "step": 7198, + "task_loss": 0.7120546698570251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4801912307739258, + "epoch": 6.09, + "learning_rate": 2.1747910209448672e-05, + "loss": 0.4609, + "step": 7199, + "task_loss": 0.8117266893386841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3592599928379059, + "epoch": 6.09, + "learning_rate": 2.174321405090636e-05, + "loss": 0.497, + "step": 7200, + "task_loss": 0.3013739287853241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5847285389900208, + "epoch": 6.09, + "learning_rate": 2.1738517892364045e-05, + "loss": 0.833, + "step": 7201, + "task_loss": 0.6599087715148926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8118599653244019, + "epoch": 6.09, + "learning_rate": 2.1733821733821735e-05, + "loss": 0.8178, + "step": 7202, + "task_loss": 1.083342432975769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4671546518802643, + "epoch": 6.09, + "learning_rate": 2.172912557527942e-05, + "loss": 0.5851, + "step": 7203, + "task_loss": 0.8996623158454895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5797631144523621, + "epoch": 6.09, + "learning_rate": 2.172442941673711e-05, + "loss": 1.0265, + "step": 7204, + "task_loss": 1.4018033742904663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47824281454086304, + "epoch": 6.09, + "learning_rate": 2.1719733258194797e-05, + "loss": 0.8023, + "step": 7205, + "task_loss": 0.35927483439445496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.952175498008728, + "epoch": 6.09, + "learning_rate": 2.1715037099652487e-05, + "loss": 0.8639, + "step": 7206, + "task_loss": 1.1770477294921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8245330452919006, + "epoch": 6.09, + "learning_rate": 2.1710340941110173e-05, + "loss": 0.8024, + "step": 7207, + "task_loss": 0.36155787110328674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49265801906585693, + "epoch": 6.09, + "learning_rate": 2.170564478256786e-05, + "loss": 0.7825, + "step": 7208, + "task_loss": 0.8571830987930298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6904418468475342, + "epoch": 6.09, + "learning_rate": 2.170094862402555e-05, + "loss": 0.8175, + "step": 7209, + "task_loss": 0.6895142793655396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6702935695648193, + "epoch": 6.09, + "learning_rate": 2.1696252465483236e-05, + "loss": 0.7629, + "step": 7210, + "task_loss": 0.7683337330818176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4230085611343384, + "epoch": 6.1, + "learning_rate": 2.1691556306940925e-05, + "loss": 0.8864, + "step": 7211, + "task_loss": 2.0329971313476562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5686674118041992, + "epoch": 6.1, + "learning_rate": 2.1686860148398612e-05, + "loss": 0.586, + "step": 7212, + "task_loss": 0.7162639498710632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0078449249267578, + "epoch": 6.1, + "learning_rate": 2.1682163989856298e-05, + "loss": 0.6551, + "step": 7213, + "task_loss": 0.7790558934211731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9040428400039673, + "epoch": 6.1, + "learning_rate": 2.1677467831313984e-05, + "loss": 0.892, + "step": 7214, + "task_loss": 1.0883476734161377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9948602914810181, + "epoch": 6.1, + "learning_rate": 2.1672771672771674e-05, + "loss": 0.8149, + "step": 7215, + "task_loss": 1.6493476629257202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8038848638534546, + "epoch": 6.1, + "learning_rate": 2.166807551422936e-05, + "loss": 0.8258, + "step": 7216, + "task_loss": 1.3868155479431152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1387102603912354, + "epoch": 6.1, + "learning_rate": 2.166337935568705e-05, + "loss": 0.9815, + "step": 7217, + "task_loss": 1.17807936668396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9181540012359619, + "epoch": 6.1, + "learning_rate": 2.1658683197144737e-05, + "loss": 0.6725, + "step": 7218, + "task_loss": 1.9234899282455444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7684754133224487, + "epoch": 6.1, + "learning_rate": 2.1653987038602426e-05, + "loss": 0.7153, + "step": 7219, + "task_loss": 0.529316246509552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5157217979431152, + "epoch": 6.1, + "learning_rate": 2.164929088006011e-05, + "loss": 0.628, + "step": 7220, + "task_loss": 0.08424042910337448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.171029806137085, + "epoch": 6.1, + "learning_rate": 2.16445947215178e-05, + "loss": 0.6756, + "step": 7221, + "task_loss": 0.7613431811332703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8572132587432861, + "epoch": 6.1, + "learning_rate": 2.1639898562975485e-05, + "loss": 0.74, + "step": 7222, + "task_loss": 0.8983513116836548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6805492639541626, + "epoch": 6.11, + "learning_rate": 2.1635202404433175e-05, + "loss": 0.9395, + "step": 7223, + "task_loss": 0.6547703742980957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8042693138122559, + "epoch": 6.11, + "learning_rate": 2.1630506245890865e-05, + "loss": 0.7344, + "step": 7224, + "task_loss": 1.053215503692627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7065130472183228, + "epoch": 6.11, + "learning_rate": 2.162581008734855e-05, + "loss": 0.8014, + "step": 7225, + "task_loss": 1.6923737525939941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6202841401100159, + "epoch": 6.11, + "learning_rate": 2.1621113928806237e-05, + "loss": 0.887, + "step": 7226, + "task_loss": 0.6131212711334229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4195966124534607, + "epoch": 6.11, + "learning_rate": 2.1616417770263924e-05, + "loss": 0.7064, + "step": 7227, + "task_loss": 0.61259526014328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8245047926902771, + "epoch": 6.11, + "learning_rate": 2.1611721611721613e-05, + "loss": 0.8428, + "step": 7228, + "task_loss": 0.6946379542350769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3024053573608398, + "epoch": 6.11, + "learning_rate": 2.16070254531793e-05, + "loss": 0.7953, + "step": 7229, + "task_loss": 1.2562988996505737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7147060632705688, + "epoch": 6.11, + "learning_rate": 2.160232929463699e-05, + "loss": 0.8503, + "step": 7230, + "task_loss": 0.7056276202201843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0790929794311523, + "epoch": 6.11, + "learning_rate": 2.1597633136094676e-05, + "loss": 0.7848, + "step": 7231, + "task_loss": 0.887088418006897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5855640172958374, + "epoch": 6.11, + "learning_rate": 2.1592936977552362e-05, + "loss": 0.614, + "step": 7232, + "task_loss": 0.3032853901386261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7523081302642822, + "epoch": 6.11, + "learning_rate": 2.158824081901005e-05, + "loss": 0.7284, + "step": 7233, + "task_loss": 0.7506266832351685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5378092527389526, + "epoch": 6.11, + "learning_rate": 2.1583544660467738e-05, + "loss": 0.8681, + "step": 7234, + "task_loss": 0.697575569152832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5744731426239014, + "epoch": 6.12, + "learning_rate": 2.1578848501925425e-05, + "loss": 0.8718, + "step": 7235, + "task_loss": 0.5974279642105103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6487067341804504, + "epoch": 6.12, + "learning_rate": 2.1574152343383114e-05, + "loss": 0.6438, + "step": 7236, + "task_loss": 0.3642679452896118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6631360054016113, + "epoch": 6.12, + "learning_rate": 2.15694561848408e-05, + "loss": 0.8855, + "step": 7237, + "task_loss": 1.20645272731781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6382851004600525, + "epoch": 6.12, + "learning_rate": 2.156476002629849e-05, + "loss": 0.5294, + "step": 7238, + "task_loss": 0.5730342864990234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6085051894187927, + "epoch": 6.12, + "learning_rate": 2.1560063867756177e-05, + "loss": 0.6526, + "step": 7239, + "task_loss": 1.1035147905349731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0517230033874512, + "epoch": 6.12, + "learning_rate": 2.1555367709213863e-05, + "loss": 0.7708, + "step": 7240, + "task_loss": 0.6915375590324402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7854740619659424, + "epoch": 6.12, + "learning_rate": 2.1550671550671553e-05, + "loss": 0.9201, + "step": 7241, + "task_loss": 0.6550876498222351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8438715934753418, + "epoch": 6.12, + "learning_rate": 2.154597539212924e-05, + "loss": 0.9139, + "step": 7242, + "task_loss": 1.0264166593551636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3967197835445404, + "epoch": 6.12, + "learning_rate": 2.154127923358693e-05, + "loss": 0.69, + "step": 7243, + "task_loss": 0.1629209965467453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7615933418273926, + "epoch": 6.12, + "learning_rate": 2.1536583075044615e-05, + "loss": 0.7217, + "step": 7244, + "task_loss": 1.0993856191635132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8833132386207581, + "epoch": 6.12, + "learning_rate": 2.15318869165023e-05, + "loss": 0.7274, + "step": 7245, + "task_loss": 0.7913080453872681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7881261706352234, + "epoch": 6.13, + "learning_rate": 2.1527190757959988e-05, + "loss": 0.8763, + "step": 7246, + "task_loss": 1.7215936183929443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6141406297683716, + "epoch": 6.13, + "learning_rate": 2.1522494599417678e-05, + "loss": 0.688, + "step": 7247, + "task_loss": 1.0577762126922607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7764167189598083, + "epoch": 6.13, + "learning_rate": 2.1517798440875364e-05, + "loss": 0.7022, + "step": 7248, + "task_loss": 1.0579684972763062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.828714907169342, + "epoch": 6.13, + "learning_rate": 2.1513102282333054e-05, + "loss": 0.5989, + "step": 7249, + "task_loss": 0.44812169671058655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.174026370048523, + "epoch": 6.13, + "learning_rate": 2.150840612379074e-05, + "loss": 0.8788, + "step": 7250, + "task_loss": 0.8426690697669983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7745528221130371, + "epoch": 6.13, + "learning_rate": 2.1503709965248426e-05, + "loss": 0.8316, + "step": 7251, + "task_loss": 1.0224412679672241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.021286964416504, + "epoch": 6.13, + "learning_rate": 2.1499013806706113e-05, + "loss": 0.9524, + "step": 7252, + "task_loss": 1.03788161277771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6251346468925476, + "epoch": 6.13, + "learning_rate": 2.1494317648163802e-05, + "loss": 0.7201, + "step": 7253, + "task_loss": 0.6775016188621521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1667661666870117, + "epoch": 6.13, + "learning_rate": 2.1489621489621492e-05, + "loss": 0.7936, + "step": 7254, + "task_loss": 0.571725606918335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49303075671195984, + "epoch": 6.13, + "learning_rate": 2.148492533107918e-05, + "loss": 0.7557, + "step": 7255, + "task_loss": 1.052638292312622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4516313076019287, + "epoch": 6.13, + "learning_rate": 2.1480229172536868e-05, + "loss": 0.7646, + "step": 7256, + "task_loss": 0.3792729377746582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3801559209823608, + "epoch": 6.13, + "learning_rate": 2.1475533013994555e-05, + "loss": 0.6164, + "step": 7257, + "task_loss": 0.9194687604904175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7355378866195679, + "epoch": 6.14, + "learning_rate": 2.147083685545224e-05, + "loss": 0.8276, + "step": 7258, + "task_loss": 1.2954992055892944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6039822101593018, + "epoch": 6.14, + "learning_rate": 2.1466140696909927e-05, + "loss": 0.8629, + "step": 7259, + "task_loss": 0.857092559337616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1318070888519287, + "epoch": 6.14, + "learning_rate": 2.1461444538367617e-05, + "loss": 0.8385, + "step": 7260, + "task_loss": 1.4391353130340576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0582722425460815, + "epoch": 6.14, + "learning_rate": 2.1456748379825303e-05, + "loss": 0.7809, + "step": 7261, + "task_loss": 0.7872764468193054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5530414581298828, + "epoch": 6.14, + "learning_rate": 2.1452052221282993e-05, + "loss": 0.6073, + "step": 7262, + "task_loss": 0.10894644260406494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.791254997253418, + "epoch": 6.14, + "learning_rate": 2.144735606274068e-05, + "loss": 0.7945, + "step": 7263, + "task_loss": 1.045981764793396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3289176821708679, + "epoch": 6.14, + "learning_rate": 2.1442659904198366e-05, + "loss": 0.6833, + "step": 7264, + "task_loss": 0.9170638918876648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6668128371238708, + "epoch": 6.14, + "learning_rate": 2.1437963745656052e-05, + "loss": 0.6903, + "step": 7265, + "task_loss": 1.2841181755065918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6977182626724243, + "epoch": 6.14, + "learning_rate": 2.1433267587113742e-05, + "loss": 0.8821, + "step": 7266, + "task_loss": 1.0069397687911987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.641633152961731, + "epoch": 6.14, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.5638, + "step": 7267, + "task_loss": 1.2753231525421143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42851322889328003, + "epoch": 6.14, + "learning_rate": 2.1423875270029118e-05, + "loss": 0.5608, + "step": 7268, + "task_loss": 1.023807406425476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8755781650543213, + "epoch": 6.14, + "learning_rate": 2.1419179111486804e-05, + "loss": 0.7999, + "step": 7269, + "task_loss": 0.3968278765678406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4234994947910309, + "epoch": 6.15, + "learning_rate": 2.1414482952944494e-05, + "loss": 0.9036, + "step": 7270, + "task_loss": 0.09756513684988022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3662344217300415, + "epoch": 6.15, + "learning_rate": 2.140978679440218e-05, + "loss": 1.0458, + "step": 7271, + "task_loss": 0.9560425281524658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9812257289886475, + "epoch": 6.15, + "learning_rate": 2.1405090635859867e-05, + "loss": 0.7054, + "step": 7272, + "task_loss": 0.7594797611236572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6088255643844604, + "epoch": 6.15, + "learning_rate": 2.1400394477317556e-05, + "loss": 0.6196, + "step": 7273, + "task_loss": 0.28826162219047546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0574575662612915, + "epoch": 6.15, + "learning_rate": 2.1395698318775243e-05, + "loss": 1.054, + "step": 7274, + "task_loss": 1.1559851169586182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.5081233978271484, + "epoch": 6.15, + "learning_rate": 2.1391002160232932e-05, + "loss": 1.0462, + "step": 7275, + "task_loss": 2.451568126678467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7172419428825378, + "epoch": 6.15, + "learning_rate": 2.138630600169062e-05, + "loss": 0.6038, + "step": 7276, + "task_loss": 1.4592489004135132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8442765474319458, + "epoch": 6.15, + "learning_rate": 2.1381609843148305e-05, + "loss": 0.6626, + "step": 7277, + "task_loss": 0.7386364340782166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6793843507766724, + "epoch": 6.15, + "learning_rate": 2.137691368460599e-05, + "loss": 0.5958, + "step": 7278, + "task_loss": 0.13768985867500305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5622518062591553, + "epoch": 6.15, + "learning_rate": 2.137221752606368e-05, + "loss": 0.7642, + "step": 7279, + "task_loss": 0.46353909373283386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8462156057357788, + "epoch": 6.15, + "learning_rate": 2.1367521367521368e-05, + "loss": 0.7459, + "step": 7280, + "task_loss": 0.8773670196533203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.98182213306427, + "epoch": 6.15, + "learning_rate": 2.1362825208979057e-05, + "loss": 0.8802, + "step": 7281, + "task_loss": 0.34256017208099365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6443221569061279, + "epoch": 6.16, + "learning_rate": 2.1358129050436744e-05, + "loss": 0.6565, + "step": 7282, + "task_loss": 0.39825159311294556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6809701919555664, + "epoch": 6.16, + "learning_rate": 2.135343289189443e-05, + "loss": 0.7691, + "step": 7283, + "task_loss": 0.3650287091732025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4708264172077179, + "epoch": 6.16, + "learning_rate": 2.1348736733352116e-05, + "loss": 0.6392, + "step": 7284, + "task_loss": 0.18303744494915009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4702027142047882, + "epoch": 6.16, + "learning_rate": 2.1344040574809806e-05, + "loss": 0.6071, + "step": 7285, + "task_loss": 0.8118699789047241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8072978258132935, + "epoch": 6.16, + "learning_rate": 2.1339344416267496e-05, + "loss": 0.8333, + "step": 7286, + "task_loss": 0.43348655104637146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.212141990661621, + "epoch": 6.16, + "learning_rate": 2.1334648257725182e-05, + "loss": 0.8561, + "step": 7287, + "task_loss": 0.8420369625091553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.653691291809082, + "epoch": 6.16, + "learning_rate": 2.1329952099182872e-05, + "loss": 0.7037, + "step": 7288, + "task_loss": 0.6111598610877991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2954797744750977, + "epoch": 6.16, + "learning_rate": 2.1325255940640558e-05, + "loss": 1.0747, + "step": 7289, + "task_loss": 1.3628555536270142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5825004577636719, + "epoch": 6.16, + "learning_rate": 2.1320559782098244e-05, + "loss": 0.582, + "step": 7290, + "task_loss": 0.1854131817817688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.414575457572937, + "epoch": 6.16, + "learning_rate": 2.131586362355593e-05, + "loss": 1.1293, + "step": 7291, + "task_loss": 1.0222036838531494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.518298864364624, + "epoch": 6.16, + "learning_rate": 2.131116746501362e-05, + "loss": 0.8891, + "step": 7292, + "task_loss": 0.4502016305923462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7490759491920471, + "epoch": 6.16, + "learning_rate": 2.1306471306471307e-05, + "loss": 0.7214, + "step": 7293, + "task_loss": 1.113375186920166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6239113807678223, + "epoch": 6.17, + "learning_rate": 2.1301775147928997e-05, + "loss": 0.6411, + "step": 7294, + "task_loss": 0.47567692399024963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8071575164794922, + "epoch": 6.17, + "learning_rate": 2.1297078989386683e-05, + "loss": 0.6916, + "step": 7295, + "task_loss": 0.6421604156494141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1491310596466064, + "epoch": 6.17, + "learning_rate": 2.129238283084437e-05, + "loss": 0.769, + "step": 7296, + "task_loss": 1.2147761583328247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.048752784729004, + "epoch": 6.17, + "learning_rate": 2.1287686672302056e-05, + "loss": 0.6648, + "step": 7297, + "task_loss": 0.7053056955337524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6138238906860352, + "epoch": 6.17, + "learning_rate": 2.1282990513759745e-05, + "loss": 0.665, + "step": 7298, + "task_loss": 0.7881291508674622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7109013795852661, + "epoch": 6.17, + "learning_rate": 2.127829435521743e-05, + "loss": 0.5564, + "step": 7299, + "task_loss": 0.5207294821739197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9928478002548218, + "epoch": 6.17, + "learning_rate": 2.127359819667512e-05, + "loss": 0.7403, + "step": 7300, + "task_loss": 0.6073621511459351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.7343764305114746, + "epoch": 6.17, + "learning_rate": 2.126890203813281e-05, + "loss": 0.972, + "step": 7301, + "task_loss": 1.8446147441864014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29834824800491333, + "epoch": 6.17, + "learning_rate": 2.1264205879590497e-05, + "loss": 0.6227, + "step": 7302, + "task_loss": 0.30995550751686096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.613131046295166, + "epoch": 6.17, + "learning_rate": 2.1259509721048184e-05, + "loss": 0.841, + "step": 7303, + "task_loss": 0.675408661365509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4670330286026001, + "epoch": 6.17, + "learning_rate": 2.125481356250587e-05, + "loss": 0.5726, + "step": 7304, + "task_loss": 0.10194724053144455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0692964792251587, + "epoch": 6.17, + "learning_rate": 2.125011740396356e-05, + "loss": 1.0102, + "step": 7305, + "task_loss": 2.002070188522339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5963374972343445, + "epoch": 6.18, + "learning_rate": 2.1245421245421246e-05, + "loss": 0.7314, + "step": 7306, + "task_loss": 0.11792619526386261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6287658214569092, + "epoch": 6.18, + "learning_rate": 2.1240725086878936e-05, + "loss": 0.746, + "step": 7307, + "task_loss": 0.8577568531036377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.923244833946228, + "epoch": 6.18, + "learning_rate": 2.1236028928336622e-05, + "loss": 0.8022, + "step": 7308, + "task_loss": 1.819543719291687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8460518717765808, + "epoch": 6.18, + "learning_rate": 2.123133276979431e-05, + "loss": 0.8748, + "step": 7309, + "task_loss": 1.0899124145507812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6069716811180115, + "epoch": 6.18, + "learning_rate": 2.1226636611251995e-05, + "loss": 0.7937, + "step": 7310, + "task_loss": 0.11758802086114883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9613973498344421, + "epoch": 6.18, + "learning_rate": 2.1221940452709685e-05, + "loss": 0.993, + "step": 7311, + "task_loss": 0.9018083810806274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6727063655853271, + "epoch": 6.18, + "learning_rate": 2.121724429416737e-05, + "loss": 0.8886, + "step": 7312, + "task_loss": 0.650562584400177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6873434782028198, + "epoch": 6.18, + "learning_rate": 2.121254813562506e-05, + "loss": 0.8502, + "step": 7313, + "task_loss": 0.6804630160331726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.502631425857544, + "epoch": 6.18, + "learning_rate": 2.1207851977082747e-05, + "loss": 0.6542, + "step": 7314, + "task_loss": 0.3972668945789337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0118098258972168, + "epoch": 6.18, + "learning_rate": 2.1203155818540433e-05, + "loss": 0.7148, + "step": 7315, + "task_loss": 1.3436074256896973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7391782402992249, + "epoch": 6.18, + "learning_rate": 2.1198459659998123e-05, + "loss": 0.7943, + "step": 7316, + "task_loss": 0.8741737604141235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5714748501777649, + "epoch": 6.19, + "learning_rate": 2.119376350145581e-05, + "loss": 0.6647, + "step": 7317, + "task_loss": 0.6296858787536621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7975035905838013, + "epoch": 6.19, + "learning_rate": 2.11890673429135e-05, + "loss": 0.8271, + "step": 7318, + "task_loss": 1.2818903923034668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0133475065231323, + "epoch": 6.19, + "learning_rate": 2.1184371184371186e-05, + "loss": 0.7281, + "step": 7319, + "task_loss": 0.6959531307220459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8711767196655273, + "epoch": 6.19, + "learning_rate": 2.1179675025828875e-05, + "loss": 0.7315, + "step": 7320, + "task_loss": 1.5789134502410889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8759069442749023, + "epoch": 6.19, + "learning_rate": 2.117497886728656e-05, + "loss": 1.0333, + "step": 7321, + "task_loss": 1.6512410640716553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9129708409309387, + "epoch": 6.19, + "learning_rate": 2.1170282708744248e-05, + "loss": 0.843, + "step": 7322, + "task_loss": 1.2214851379394531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5919798612594604, + "epoch": 6.19, + "learning_rate": 2.1165586550201934e-05, + "loss": 0.67, + "step": 7323, + "task_loss": 0.5314354300498962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8062951564788818, + "epoch": 6.19, + "learning_rate": 2.1160890391659624e-05, + "loss": 0.6456, + "step": 7324, + "task_loss": 0.32059571146965027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.773934006690979, + "epoch": 6.19, + "learning_rate": 2.115619423311731e-05, + "loss": 0.8783, + "step": 7325, + "task_loss": 0.9014705419540405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5764439105987549, + "epoch": 6.19, + "learning_rate": 2.1151498074575e-05, + "loss": 0.6263, + "step": 7326, + "task_loss": 0.36131203174591064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39726966619491577, + "epoch": 6.19, + "learning_rate": 2.1146801916032686e-05, + "loss": 0.6354, + "step": 7327, + "task_loss": 0.2478232979774475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5562845468521118, + "epoch": 6.19, + "learning_rate": 2.1142105757490373e-05, + "loss": 0.7944, + "step": 7328, + "task_loss": 0.9592318534851074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.794558048248291, + "epoch": 6.2, + "learning_rate": 2.113740959894806e-05, + "loss": 0.8461, + "step": 7329, + "task_loss": 0.22421178221702576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7193570137023926, + "epoch": 6.2, + "learning_rate": 2.113271344040575e-05, + "loss": 0.6712, + "step": 7330, + "task_loss": 0.3663038909435272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5827070474624634, + "epoch": 6.2, + "learning_rate": 2.112801728186344e-05, + "loss": 0.8942, + "step": 7331, + "task_loss": 1.3067294359207153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7363001704216003, + "epoch": 6.2, + "learning_rate": 2.1123321123321125e-05, + "loss": 0.7152, + "step": 7332, + "task_loss": 0.47304561734199524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.228287696838379, + "epoch": 6.2, + "learning_rate": 2.1118624964778815e-05, + "loss": 0.8615, + "step": 7333, + "task_loss": 1.7882959842681885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4802640676498413, + "epoch": 6.2, + "learning_rate": 2.1113928806236498e-05, + "loss": 0.6511, + "step": 7334, + "task_loss": 0.8797512650489807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42833617329597473, + "epoch": 6.2, + "learning_rate": 2.1109232647694187e-05, + "loss": 0.7442, + "step": 7335, + "task_loss": 0.3300752341747284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6832612156867981, + "epoch": 6.2, + "learning_rate": 2.1104536489151874e-05, + "loss": 0.6556, + "step": 7336, + "task_loss": 0.6410014629364014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0272929668426514, + "epoch": 6.2, + "learning_rate": 2.1099840330609563e-05, + "loss": 0.7582, + "step": 7337, + "task_loss": 1.223035216331482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.598798394203186, + "epoch": 6.2, + "learning_rate": 2.109514417206725e-05, + "loss": 0.8003, + "step": 7338, + "task_loss": 0.9853284358978271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7869447469711304, + "epoch": 6.2, + "learning_rate": 2.109044801352494e-05, + "loss": 0.5525, + "step": 7339, + "task_loss": 0.3112577199935913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.830415666103363, + "epoch": 6.2, + "learning_rate": 2.1085751854982626e-05, + "loss": 0.9618, + "step": 7340, + "task_loss": 0.8471307754516602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8166911005973816, + "epoch": 6.21, + "learning_rate": 2.1081055696440312e-05, + "loss": 0.7667, + "step": 7341, + "task_loss": 1.1074939966201782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5692092180252075, + "epoch": 6.21, + "learning_rate": 2.1076359537898e-05, + "loss": 0.8185, + "step": 7342, + "task_loss": 0.8674144744873047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8530000448226929, + "epoch": 6.21, + "learning_rate": 2.1071663379355688e-05, + "loss": 0.9748, + "step": 7343, + "task_loss": 0.7794963121414185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6765753626823425, + "epoch": 6.21, + "learning_rate": 2.1066967220813375e-05, + "loss": 0.6717, + "step": 7344, + "task_loss": 0.48213571310043335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7187105417251587, + "epoch": 6.21, + "learning_rate": 2.1062271062271064e-05, + "loss": 0.6571, + "step": 7345, + "task_loss": 0.33861497044563293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30975550413131714, + "epoch": 6.21, + "learning_rate": 2.105757490372875e-05, + "loss": 0.7254, + "step": 7346, + "task_loss": 0.9519387483596802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1694822311401367, + "epoch": 6.21, + "learning_rate": 2.1052878745186437e-05, + "loss": 0.8362, + "step": 7347, + "task_loss": 1.2636828422546387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5459661483764648, + "epoch": 6.21, + "learning_rate": 2.1048182586644127e-05, + "loss": 0.7631, + "step": 7348, + "task_loss": 0.5094197988510132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1794116497039795, + "epoch": 6.21, + "learning_rate": 2.1043486428101813e-05, + "loss": 0.8282, + "step": 7349, + "task_loss": 2.7132728099823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5219706296920776, + "epoch": 6.21, + "learning_rate": 2.1038790269559503e-05, + "loss": 0.8697, + "step": 7350, + "task_loss": 0.4655725955963135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8996977806091309, + "epoch": 6.21, + "learning_rate": 2.103409411101719e-05, + "loss": 0.7549, + "step": 7351, + "task_loss": 0.7882678508758545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8599838614463806, + "epoch": 6.21, + "learning_rate": 2.102939795247488e-05, + "loss": 0.7523, + "step": 7352, + "task_loss": 0.8759251832962036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5591468811035156, + "epoch": 6.22, + "learning_rate": 2.1024701793932565e-05, + "loss": 0.7208, + "step": 7353, + "task_loss": 1.0729572772979736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6783599853515625, + "epoch": 6.22, + "learning_rate": 2.102000563539025e-05, + "loss": 0.7947, + "step": 7354, + "task_loss": 1.139552354812622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6930433511734009, + "epoch": 6.22, + "learning_rate": 2.1015309476847938e-05, + "loss": 0.7187, + "step": 7355, + "task_loss": 0.4436817169189453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.683974027633667, + "epoch": 6.22, + "learning_rate": 2.1010613318305628e-05, + "loss": 0.6961, + "step": 7356, + "task_loss": 1.223222255706787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.758063793182373, + "epoch": 6.22, + "learning_rate": 2.1005917159763314e-05, + "loss": 0.8119, + "step": 7357, + "task_loss": 0.8501352667808533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5772619843482971, + "epoch": 6.22, + "learning_rate": 2.1001221001221004e-05, + "loss": 0.8086, + "step": 7358, + "task_loss": 0.38559019565582275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6232973337173462, + "epoch": 6.22, + "learning_rate": 2.099652484267869e-05, + "loss": 0.5629, + "step": 7359, + "task_loss": 0.7940839529037476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.870672881603241, + "epoch": 6.22, + "learning_rate": 2.0991828684136376e-05, + "loss": 0.6846, + "step": 7360, + "task_loss": 0.8545135259628296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7082021832466125, + "epoch": 6.22, + "learning_rate": 2.0987132525594063e-05, + "loss": 0.7674, + "step": 7361, + "task_loss": 0.8793970346450806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7765367031097412, + "epoch": 6.22, + "learning_rate": 2.0982436367051752e-05, + "loss": 0.8243, + "step": 7362, + "task_loss": 0.5434755086898804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6026843190193176, + "epoch": 6.22, + "learning_rate": 2.0977740208509442e-05, + "loss": 0.7438, + "step": 7363, + "task_loss": 0.7945125102996826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9115158319473267, + "epoch": 6.22, + "learning_rate": 2.097304404996713e-05, + "loss": 0.7572, + "step": 7364, + "task_loss": 0.7040691375732422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5095666646957397, + "epoch": 6.23, + "learning_rate": 2.0968347891424818e-05, + "loss": 0.7362, + "step": 7365, + "task_loss": 1.0690009593963623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6209613084793091, + "epoch": 6.23, + "learning_rate": 2.09636517328825e-05, + "loss": 0.7377, + "step": 7366, + "task_loss": 0.19138358533382416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.430869460105896, + "epoch": 6.23, + "learning_rate": 2.095895557434019e-05, + "loss": 0.6664, + "step": 7367, + "task_loss": 1.1845269203186035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6543041467666626, + "epoch": 6.23, + "learning_rate": 2.0954259415797877e-05, + "loss": 0.848, + "step": 7368, + "task_loss": 0.4298076033592224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0701470375061035, + "epoch": 6.23, + "learning_rate": 2.0949563257255567e-05, + "loss": 0.6883, + "step": 7369, + "task_loss": 1.2839834690093994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0643272399902344, + "epoch": 6.23, + "learning_rate": 2.0944867098713253e-05, + "loss": 0.9253, + "step": 7370, + "task_loss": 1.9272266626358032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8384515643119812, + "epoch": 6.23, + "learning_rate": 2.0940170940170943e-05, + "loss": 0.7032, + "step": 7371, + "task_loss": 0.5410663485527039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7600892782211304, + "epoch": 6.23, + "learning_rate": 2.093547478162863e-05, + "loss": 0.6635, + "step": 7372, + "task_loss": 0.5667957663536072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3808114528656006, + "epoch": 6.23, + "learning_rate": 2.0930778623086316e-05, + "loss": 0.904, + "step": 7373, + "task_loss": 1.5246059894561768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9360626935958862, + "epoch": 6.23, + "learning_rate": 2.0926082464544002e-05, + "loss": 0.9408, + "step": 7374, + "task_loss": 1.1228195428848267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5500826239585876, + "epoch": 6.23, + "learning_rate": 2.0921386306001692e-05, + "loss": 0.5733, + "step": 7375, + "task_loss": 0.426235556602478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6655076146125793, + "epoch": 6.23, + "learning_rate": 2.0916690147459378e-05, + "loss": 0.9387, + "step": 7376, + "task_loss": 1.268262267112732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.599971354007721, + "epoch": 6.24, + "learning_rate": 2.0911993988917068e-05, + "loss": 0.8371, + "step": 7377, + "task_loss": 0.5753202438354492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8222448825836182, + "epoch": 6.24, + "learning_rate": 2.0907297830374754e-05, + "loss": 0.7653, + "step": 7378, + "task_loss": 1.6042546033859253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7423411011695862, + "epoch": 6.24, + "learning_rate": 2.090260167183244e-05, + "loss": 0.7653, + "step": 7379, + "task_loss": 1.3295553922653198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5579724311828613, + "epoch": 6.24, + "learning_rate": 2.089790551329013e-05, + "loss": 0.6765, + "step": 7380, + "task_loss": 0.6420096158981323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7439407110214233, + "epoch": 6.24, + "learning_rate": 2.0893209354747817e-05, + "loss": 0.8627, + "step": 7381, + "task_loss": 1.2603936195373535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.625250518321991, + "epoch": 6.24, + "learning_rate": 2.0888513196205506e-05, + "loss": 0.9104, + "step": 7382, + "task_loss": 1.421195149421692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.085374116897583, + "epoch": 6.24, + "learning_rate": 2.0883817037663193e-05, + "loss": 0.8325, + "step": 7383, + "task_loss": 1.346087098121643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5015108585357666, + "epoch": 6.24, + "learning_rate": 2.0879120879120882e-05, + "loss": 0.6582, + "step": 7384, + "task_loss": 0.5274398922920227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6149212121963501, + "epoch": 6.24, + "learning_rate": 2.087442472057857e-05, + "loss": 0.7354, + "step": 7385, + "task_loss": 0.4828100800514221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.745689332485199, + "epoch": 6.24, + "learning_rate": 2.0869728562036255e-05, + "loss": 0.5751, + "step": 7386, + "task_loss": 0.22957485914230347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6897575855255127, + "epoch": 6.24, + "learning_rate": 2.086503240349394e-05, + "loss": 0.6512, + "step": 7387, + "task_loss": 1.0318617820739746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8402003049850464, + "epoch": 6.24, + "learning_rate": 2.086033624495163e-05, + "loss": 0.806, + "step": 7388, + "task_loss": 0.33925187587738037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9079195261001587, + "epoch": 6.25, + "learning_rate": 2.0855640086409317e-05, + "loss": 0.6746, + "step": 7389, + "task_loss": 0.630473256111145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7542190551757812, + "epoch": 6.25, + "learning_rate": 2.0850943927867007e-05, + "loss": 0.7077, + "step": 7390, + "task_loss": 0.4988028109073639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.562968373298645, + "epoch": 6.25, + "learning_rate": 2.0846247769324694e-05, + "loss": 0.716, + "step": 7391, + "task_loss": 0.6563072204589844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1461164951324463, + "epoch": 6.25, + "learning_rate": 2.084155161078238e-05, + "loss": 0.8346, + "step": 7392, + "task_loss": 1.3953099250793457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6645389795303345, + "epoch": 6.25, + "learning_rate": 2.083685545224007e-05, + "loss": 0.5527, + "step": 7393, + "task_loss": 0.8259853720664978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4294430613517761, + "epoch": 6.25, + "learning_rate": 2.0832159293697756e-05, + "loss": 0.641, + "step": 7394, + "task_loss": 0.765816867351532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34131571650505066, + "epoch": 6.25, + "learning_rate": 2.0827463135155446e-05, + "loss": 0.6477, + "step": 7395, + "task_loss": 0.3596201241016388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34524106979370117, + "epoch": 6.25, + "learning_rate": 2.0822766976613132e-05, + "loss": 0.6091, + "step": 7396, + "task_loss": 0.3914586305618286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0210773944854736, + "epoch": 6.25, + "learning_rate": 2.0818070818070822e-05, + "loss": 0.7631, + "step": 7397, + "task_loss": 1.1279278993606567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5434674620628357, + "epoch": 6.25, + "learning_rate": 2.0813374659528505e-05, + "loss": 0.775, + "step": 7398, + "task_loss": 0.45286065340042114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8154321312904358, + "epoch": 6.25, + "learning_rate": 2.0808678500986194e-05, + "loss": 1.0322, + "step": 7399, + "task_loss": 2.0448100566864014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5701747536659241, + "epoch": 6.26, + "learning_rate": 2.080398234244388e-05, + "loss": 0.7295, + "step": 7400, + "task_loss": 0.568591296672821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6002664566040039, + "epoch": 6.26, + "learning_rate": 2.079928618390157e-05, + "loss": 0.7291, + "step": 7401, + "task_loss": 0.5425191521644592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9865236878395081, + "epoch": 6.26, + "learning_rate": 2.0794590025359257e-05, + "loss": 0.8912, + "step": 7402, + "task_loss": 0.9727468490600586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6687947511672974, + "epoch": 6.26, + "learning_rate": 2.0789893866816947e-05, + "loss": 0.7688, + "step": 7403, + "task_loss": 0.41799309849739075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6444734334945679, + "epoch": 6.26, + "learning_rate": 2.0785197708274633e-05, + "loss": 0.6799, + "step": 7404, + "task_loss": 0.3296579122543335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1323192119598389, + "epoch": 6.26, + "learning_rate": 2.078050154973232e-05, + "loss": 0.8226, + "step": 7405, + "task_loss": 0.7677993774414062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3203374147415161, + "epoch": 6.26, + "learning_rate": 2.0775805391190006e-05, + "loss": 0.7936, + "step": 7406, + "task_loss": 0.3963952362537384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8323668241500854, + "epoch": 6.26, + "learning_rate": 2.0771109232647695e-05, + "loss": 0.6554, + "step": 7407, + "task_loss": 1.4125797748565674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.85093754529953, + "epoch": 6.26, + "learning_rate": 2.0766413074105385e-05, + "loss": 0.8541, + "step": 7408, + "task_loss": 1.2057921886444092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7733677625656128, + "epoch": 6.26, + "learning_rate": 2.076171691556307e-05, + "loss": 0.7217, + "step": 7409, + "task_loss": 1.016628384590149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6102703213691711, + "epoch": 6.26, + "learning_rate": 2.0757020757020758e-05, + "loss": 0.8564, + "step": 7410, + "task_loss": 0.8334298729896545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9523823261260986, + "epoch": 6.26, + "learning_rate": 2.0752324598478444e-05, + "loss": 0.7016, + "step": 7411, + "task_loss": 1.1367703676223755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5028101801872253, + "epoch": 6.27, + "learning_rate": 2.0747628439936134e-05, + "loss": 0.6691, + "step": 7412, + "task_loss": 0.7603978514671326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7649860382080078, + "epoch": 6.27, + "learning_rate": 2.074293228139382e-05, + "loss": 0.8223, + "step": 7413, + "task_loss": 1.6488134860992432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8108687400817871, + "epoch": 6.27, + "learning_rate": 2.073823612285151e-05, + "loss": 0.7728, + "step": 7414, + "task_loss": 1.4445828199386597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7168159484863281, + "epoch": 6.27, + "learning_rate": 2.0733539964309196e-05, + "loss": 0.6596, + "step": 7415, + "task_loss": 0.8484806418418884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1400716304779053, + "epoch": 6.27, + "learning_rate": 2.0728843805766886e-05, + "loss": 1.0189, + "step": 7416, + "task_loss": 2.321479320526123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5252053737640381, + "epoch": 6.27, + "learning_rate": 2.072414764722457e-05, + "loss": 0.6466, + "step": 7417, + "task_loss": 0.5142242908477783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0043941736221313, + "epoch": 6.27, + "learning_rate": 2.071945148868226e-05, + "loss": 0.76, + "step": 7418, + "task_loss": 0.7001596093177795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8996866941452026, + "epoch": 6.27, + "learning_rate": 2.0714755330139945e-05, + "loss": 0.7252, + "step": 7419, + "task_loss": 0.3344026803970337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9410960674285889, + "epoch": 6.27, + "learning_rate": 2.0710059171597635e-05, + "loss": 0.8433, + "step": 7420, + "task_loss": 1.3487679958343506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5089696645736694, + "epoch": 6.27, + "learning_rate": 2.070536301305532e-05, + "loss": 0.6191, + "step": 7421, + "task_loss": 0.883463442325592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8386927247047424, + "epoch": 6.27, + "learning_rate": 2.070066685451301e-05, + "loss": 0.7684, + "step": 7422, + "task_loss": 0.4113532304763794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6311105489730835, + "epoch": 6.27, + "learning_rate": 2.0695970695970697e-05, + "loss": 0.8188, + "step": 7423, + "task_loss": 0.4659413993358612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5875629186630249, + "epoch": 6.28, + "learning_rate": 2.0691274537428383e-05, + "loss": 0.7516, + "step": 7424, + "task_loss": 0.41164731979370117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6897075772285461, + "epoch": 6.28, + "learning_rate": 2.0686578378886073e-05, + "loss": 0.645, + "step": 7425, + "task_loss": 0.5500578880310059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7783454656600952, + "epoch": 6.28, + "learning_rate": 2.068188222034376e-05, + "loss": 0.8443, + "step": 7426, + "task_loss": 0.9478763937950134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40555626153945923, + "epoch": 6.28, + "learning_rate": 2.067718606180145e-05, + "loss": 0.6945, + "step": 7427, + "task_loss": 0.3701245188713074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6588277816772461, + "epoch": 6.28, + "learning_rate": 2.0672489903259136e-05, + "loss": 0.6197, + "step": 7428, + "task_loss": 0.48090481758117676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46678662300109863, + "epoch": 6.28, + "learning_rate": 2.0667793744716822e-05, + "loss": 0.6836, + "step": 7429, + "task_loss": 0.31632208824157715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7910106182098389, + "epoch": 6.28, + "learning_rate": 2.0663097586174508e-05, + "loss": 0.6653, + "step": 7430, + "task_loss": 0.2654785215854645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7949224710464478, + "epoch": 6.28, + "learning_rate": 2.0658401427632198e-05, + "loss": 0.7778, + "step": 7431, + "task_loss": 1.1252044439315796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6070295572280884, + "epoch": 6.28, + "learning_rate": 2.0653705269089884e-05, + "loss": 0.5675, + "step": 7432, + "task_loss": 0.6293492317199707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5609884262084961, + "epoch": 6.28, + "learning_rate": 2.0649009110547574e-05, + "loss": 0.8402, + "step": 7433, + "task_loss": 0.5521501302719116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9000744819641113, + "epoch": 6.28, + "learning_rate": 2.064431295200526e-05, + "loss": 0.9367, + "step": 7434, + "task_loss": 0.5535411834716797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6844930648803711, + "epoch": 6.28, + "learning_rate": 2.063961679346295e-05, + "loss": 0.7689, + "step": 7435, + "task_loss": 0.306185245513916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9600838422775269, + "epoch": 6.29, + "learning_rate": 2.0634920634920636e-05, + "loss": 0.8011, + "step": 7436, + "task_loss": 0.6257910132408142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9888410568237305, + "epoch": 6.29, + "learning_rate": 2.0630224476378323e-05, + "loss": 0.6437, + "step": 7437, + "task_loss": 0.355661004781723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1719802618026733, + "epoch": 6.29, + "learning_rate": 2.062552831783601e-05, + "loss": 1.1153, + "step": 7438, + "task_loss": 1.41117262840271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6061360836029053, + "epoch": 6.29, + "learning_rate": 2.06208321592937e-05, + "loss": 0.6338, + "step": 7439, + "task_loss": 0.8253005743026733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4684944152832031, + "epoch": 6.29, + "learning_rate": 2.061613600075139e-05, + "loss": 0.84, + "step": 7440, + "task_loss": 2.695319652557373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5640551447868347, + "epoch": 6.29, + "learning_rate": 2.0611439842209075e-05, + "loss": 0.8043, + "step": 7441, + "task_loss": 0.8695645332336426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6319279670715332, + "epoch": 6.29, + "learning_rate": 2.060674368366676e-05, + "loss": 0.8638, + "step": 7442, + "task_loss": 0.8624377250671387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6750088930130005, + "epoch": 6.29, + "learning_rate": 2.0602047525124448e-05, + "loss": 0.6537, + "step": 7443, + "task_loss": 0.48857519030570984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5105949640274048, + "epoch": 6.29, + "learning_rate": 2.0597351366582137e-05, + "loss": 0.5997, + "step": 7444, + "task_loss": 0.366051584482193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7292328476905823, + "epoch": 6.29, + "learning_rate": 2.0592655208039824e-05, + "loss": 0.9565, + "step": 7445, + "task_loss": 0.4703395366668701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6171258687973022, + "epoch": 6.29, + "learning_rate": 2.0587959049497513e-05, + "loss": 0.6895, + "step": 7446, + "task_loss": 0.8289067149162292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8149582147598267, + "epoch": 6.29, + "learning_rate": 2.05832628909552e-05, + "loss": 0.7496, + "step": 7447, + "task_loss": 0.7453181743621826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3368885517120361, + "epoch": 6.3, + "learning_rate": 2.057856673241289e-05, + "loss": 0.9133, + "step": 7448, + "task_loss": 1.1894291639328003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7083490490913391, + "epoch": 6.3, + "learning_rate": 2.0573870573870572e-05, + "loss": 0.6579, + "step": 7449, + "task_loss": 0.9935703277587891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7421674728393555, + "epoch": 6.3, + "learning_rate": 2.0569174415328262e-05, + "loss": 1.0137, + "step": 7450, + "task_loss": 1.1948834657669067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9806674122810364, + "epoch": 6.3, + "learning_rate": 2.056447825678595e-05, + "loss": 0.8114, + "step": 7451, + "task_loss": 0.7849366664886475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7446266412734985, + "epoch": 6.3, + "learning_rate": 2.0559782098243638e-05, + "loss": 0.7426, + "step": 7452, + "task_loss": 0.24225614964962006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9784231185913086, + "epoch": 6.3, + "learning_rate": 2.0555085939701324e-05, + "loss": 0.7389, + "step": 7453, + "task_loss": 1.2189289331436157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.657981276512146, + "epoch": 6.3, + "learning_rate": 2.0550389781159014e-05, + "loss": 0.5278, + "step": 7454, + "task_loss": 0.6073331236839294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8415519595146179, + "epoch": 6.3, + "learning_rate": 2.05456936226167e-05, + "loss": 0.8204, + "step": 7455, + "task_loss": 1.8905978202819824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.767527163028717, + "epoch": 6.3, + "learning_rate": 2.0540997464074387e-05, + "loss": 0.6762, + "step": 7456, + "task_loss": 0.5372083783149719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5876808166503906, + "epoch": 6.3, + "learning_rate": 2.0536301305532077e-05, + "loss": 0.6477, + "step": 7457, + "task_loss": 1.4048779010772705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8592723608016968, + "epoch": 6.3, + "learning_rate": 2.0531605146989763e-05, + "loss": 0.6292, + "step": 7458, + "task_loss": 0.8719321489334106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5751591920852661, + "epoch": 6.3, + "learning_rate": 2.0526908988447453e-05, + "loss": 0.7772, + "step": 7459, + "task_loss": 1.1154478788375854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7422034740447998, + "epoch": 6.31, + "learning_rate": 2.052221282990514e-05, + "loss": 0.7891, + "step": 7460, + "task_loss": 0.7303847670555115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7193036079406738, + "epoch": 6.31, + "learning_rate": 2.0517516671362825e-05, + "loss": 0.558, + "step": 7461, + "task_loss": 0.32498764991760254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1371911764144897, + "epoch": 6.31, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.7194, + "step": 7462, + "task_loss": 1.1886568069458008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.574236273765564, + "epoch": 6.31, + "learning_rate": 2.05081243542782e-05, + "loss": 0.8314, + "step": 7463, + "task_loss": 0.28935617208480835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3699676990509033, + "epoch": 6.31, + "learning_rate": 2.0503428195735888e-05, + "loss": 0.6049, + "step": 7464, + "task_loss": 0.6173701882362366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5916211605072021, + "epoch": 6.31, + "learning_rate": 2.0498732037193578e-05, + "loss": 0.803, + "step": 7465, + "task_loss": 0.29219841957092285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5717437267303467, + "epoch": 6.31, + "learning_rate": 2.0494035878651264e-05, + "loss": 0.6655, + "step": 7466, + "task_loss": 0.25926673412323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6863039135932922, + "epoch": 6.31, + "learning_rate": 2.0489339720108954e-05, + "loss": 0.803, + "step": 7467, + "task_loss": 0.697150468826294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0390230417251587, + "epoch": 6.31, + "learning_rate": 2.0484643561566637e-05, + "loss": 0.7169, + "step": 7468, + "task_loss": 1.4959977865219116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8238416910171509, + "epoch": 6.31, + "learning_rate": 2.0479947403024326e-05, + "loss": 1.0381, + "step": 7469, + "task_loss": 0.9743215441703796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7851182222366333, + "epoch": 6.31, + "learning_rate": 2.0475251244482016e-05, + "loss": 0.8725, + "step": 7470, + "task_loss": 0.5519124269485474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.554581880569458, + "epoch": 6.32, + "learning_rate": 2.0470555085939702e-05, + "loss": 0.8311, + "step": 7471, + "task_loss": 1.0280426740646362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7475284337997437, + "epoch": 6.32, + "learning_rate": 2.0465858927397392e-05, + "loss": 0.7034, + "step": 7472, + "task_loss": 0.7715740203857422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48075205087661743, + "epoch": 6.32, + "learning_rate": 2.046116276885508e-05, + "loss": 0.6402, + "step": 7473, + "task_loss": 1.2195582389831543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9182165861129761, + "epoch": 6.32, + "learning_rate": 2.0456466610312765e-05, + "loss": 0.7817, + "step": 7474, + "task_loss": 0.8701863884925842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7879924774169922, + "epoch": 6.32, + "learning_rate": 2.045177045177045e-05, + "loss": 0.8162, + "step": 7475, + "task_loss": 0.6226803064346313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9207605123519897, + "epoch": 6.32, + "learning_rate": 2.044707429322814e-05, + "loss": 0.8212, + "step": 7476, + "task_loss": 1.193554401397705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3807130455970764, + "epoch": 6.32, + "learning_rate": 2.0442378134685827e-05, + "loss": 0.5516, + "step": 7477, + "task_loss": 0.7210559844970703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0903037786483765, + "epoch": 6.32, + "learning_rate": 2.0437681976143517e-05, + "loss": 0.7389, + "step": 7478, + "task_loss": 0.9299229979515076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1902445554733276, + "epoch": 6.32, + "learning_rate": 2.0432985817601203e-05, + "loss": 0.7538, + "step": 7479, + "task_loss": 1.3490186929702759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6994028687477112, + "epoch": 6.32, + "learning_rate": 2.0428289659058893e-05, + "loss": 0.7985, + "step": 7480, + "task_loss": 1.6950331926345825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8571994304656982, + "epoch": 6.32, + "learning_rate": 2.0423593500516576e-05, + "loss": 0.7862, + "step": 7481, + "task_loss": 0.42559173703193665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.857463538646698, + "epoch": 6.32, + "learning_rate": 2.0418897341974266e-05, + "loss": 0.7129, + "step": 7482, + "task_loss": 0.5072205066680908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4534081220626831, + "epoch": 6.33, + "learning_rate": 2.0414201183431952e-05, + "loss": 0.5109, + "step": 7483, + "task_loss": 0.4049938917160034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7866555452346802, + "epoch": 6.33, + "learning_rate": 2.040950502488964e-05, + "loss": 0.8483, + "step": 7484, + "task_loss": 1.0585737228393555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4317275285720825, + "epoch": 6.33, + "learning_rate": 2.040480886634733e-05, + "loss": 0.7575, + "step": 7485, + "task_loss": 0.8607675433158875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4805961549282074, + "epoch": 6.33, + "learning_rate": 2.0400112707805018e-05, + "loss": 0.6726, + "step": 7486, + "task_loss": 0.9413197040557861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6208993196487427, + "epoch": 6.33, + "learning_rate": 2.0395416549262704e-05, + "loss": 0.7515, + "step": 7487, + "task_loss": 0.7183432579040527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5415467619895935, + "epoch": 6.33, + "learning_rate": 2.039072039072039e-05, + "loss": 0.5815, + "step": 7488, + "task_loss": 0.30218827724456787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26674285531044006, + "epoch": 6.33, + "learning_rate": 2.038602423217808e-05, + "loss": 0.4508, + "step": 7489, + "task_loss": 0.04002999886870384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0022109746932983, + "epoch": 6.33, + "learning_rate": 2.0381328073635766e-05, + "loss": 0.7987, + "step": 7490, + "task_loss": 0.5550556778907776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0105533599853516, + "epoch": 6.33, + "learning_rate": 2.0376631915093456e-05, + "loss": 0.9286, + "step": 7491, + "task_loss": 0.5031957030296326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4203837215900421, + "epoch": 6.33, + "learning_rate": 2.0371935756551143e-05, + "loss": 0.6792, + "step": 7492, + "task_loss": 0.19901612401008606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5406327247619629, + "epoch": 6.33, + "learning_rate": 2.036723959800883e-05, + "loss": 0.7482, + "step": 7493, + "task_loss": 0.7134935855865479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3313896656036377, + "epoch": 6.33, + "learning_rate": 2.0362543439466515e-05, + "loss": 0.6097, + "step": 7494, + "task_loss": 0.7809665203094482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.714168906211853, + "epoch": 6.34, + "learning_rate": 2.0357847280924205e-05, + "loss": 0.8793, + "step": 7495, + "task_loss": 0.9553563594818115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5723742842674255, + "epoch": 6.34, + "learning_rate": 2.035315112238189e-05, + "loss": 0.7048, + "step": 7496, + "task_loss": 0.340175598859787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1017974615097046, + "epoch": 6.34, + "learning_rate": 2.034845496383958e-05, + "loss": 0.7591, + "step": 7497, + "task_loss": 1.0998488664627075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5159154534339905, + "epoch": 6.34, + "learning_rate": 2.0343758805297267e-05, + "loss": 0.7451, + "step": 7498, + "task_loss": 0.5804292559623718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8280482888221741, + "epoch": 6.34, + "learning_rate": 2.0339062646754957e-05, + "loss": 0.7072, + "step": 7499, + "task_loss": 0.7390979528427124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7301377058029175, + "epoch": 6.34, + "learning_rate": 2.033436648821264e-05, + "loss": 0.7402, + "step": 7500, + "task_loss": 0.823592483997345 + }, + { + "epoch": 6.34, + "eval_accuracy": 0.8906138613861386, + "eval_loss": 0.48427197337150574, + "eval_runtime": 227.9954, + "eval_samples_per_second": 110.748, + "eval_steps_per_second": 0.868, + "step": 7500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6065465211868286, + "epoch": 6.34, + "learning_rate": 2.032967032967033e-05, + "loss": 0.641, + "step": 7501, + "task_loss": 1.3591396808624268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.071468710899353, + "epoch": 6.34, + "learning_rate": 2.032497417112802e-05, + "loss": 0.9143, + "step": 7502, + "task_loss": 0.5443183183670044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6417160034179688, + "epoch": 6.34, + "learning_rate": 2.0320278012585706e-05, + "loss": 0.6334, + "step": 7503, + "task_loss": 0.4025523364543915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6647366881370544, + "epoch": 6.34, + "learning_rate": 2.0315581854043396e-05, + "loss": 0.6823, + "step": 7504, + "task_loss": 0.39806410670280457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5919621586799622, + "epoch": 6.34, + "learning_rate": 2.0310885695501082e-05, + "loss": 0.6561, + "step": 7505, + "task_loss": 0.36361443996429443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7387390732765198, + "epoch": 6.34, + "learning_rate": 2.0306189536958768e-05, + "loss": 0.6673, + "step": 7506, + "task_loss": 1.030948281288147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49842870235443115, + "epoch": 6.35, + "learning_rate": 2.0301493378416455e-05, + "loss": 0.7408, + "step": 7507, + "task_loss": 0.414078950881958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8158503770828247, + "epoch": 6.35, + "learning_rate": 2.0296797219874144e-05, + "loss": 0.8865, + "step": 7508, + "task_loss": 0.15955457091331482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42736566066741943, + "epoch": 6.35, + "learning_rate": 2.029210106133183e-05, + "loss": 0.7908, + "step": 7509, + "task_loss": 1.1600167751312256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8904193639755249, + "epoch": 6.35, + "learning_rate": 2.028740490278952e-05, + "loss": 0.7694, + "step": 7510, + "task_loss": 0.6995530724525452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40880489349365234, + "epoch": 6.35, + "learning_rate": 2.0282708744247207e-05, + "loss": 0.6613, + "step": 7511, + "task_loss": 0.48771902918815613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8147513270378113, + "epoch": 6.35, + "learning_rate": 2.0278012585704893e-05, + "loss": 0.629, + "step": 7512, + "task_loss": 0.6779705286026001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5522522926330566, + "epoch": 6.35, + "learning_rate": 2.027331642716258e-05, + "loss": 0.7524, + "step": 7513, + "task_loss": 0.6143159866333008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5842427611351013, + "epoch": 6.35, + "learning_rate": 2.026862026862027e-05, + "loss": 0.6785, + "step": 7514, + "task_loss": 0.37798845767974854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5957674980163574, + "epoch": 6.35, + "learning_rate": 2.0263924110077955e-05, + "loss": 0.6384, + "step": 7515, + "task_loss": 1.0146468877792358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8060710430145264, + "epoch": 6.35, + "learning_rate": 2.0259227951535645e-05, + "loss": 0.7518, + "step": 7516, + "task_loss": 0.7413665652275085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5653182864189148, + "epoch": 6.35, + "learning_rate": 2.0254531792993335e-05, + "loss": 0.6608, + "step": 7517, + "task_loss": 0.18142037093639374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.567977786064148, + "epoch": 6.35, + "learning_rate": 2.024983563445102e-05, + "loss": 0.6526, + "step": 7518, + "task_loss": 0.7732404470443726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1748316287994385, + "epoch": 6.36, + "learning_rate": 2.0245139475908708e-05, + "loss": 0.8862, + "step": 7519, + "task_loss": 0.6319252848625183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47125524282455444, + "epoch": 6.36, + "learning_rate": 2.0240443317366394e-05, + "loss": 0.868, + "step": 7520, + "task_loss": 0.39438170194625854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6819478869438171, + "epoch": 6.36, + "learning_rate": 2.0235747158824084e-05, + "loss": 0.7978, + "step": 7521, + "task_loss": 1.2663321495056152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9620743989944458, + "epoch": 6.36, + "learning_rate": 2.023105100028177e-05, + "loss": 1.0633, + "step": 7522, + "task_loss": 2.177733898162842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.101214051246643, + "epoch": 6.36, + "learning_rate": 2.022635484173946e-05, + "loss": 0.7539, + "step": 7523, + "task_loss": 0.4256002902984619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7569538950920105, + "epoch": 6.36, + "learning_rate": 2.0221658683197146e-05, + "loss": 0.8757, + "step": 7524, + "task_loss": 1.3909714221954346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4687160551548004, + "epoch": 6.36, + "learning_rate": 2.0216962524654832e-05, + "loss": 0.7759, + "step": 7525, + "task_loss": 0.6716315150260925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5712404251098633, + "epoch": 6.36, + "learning_rate": 2.021226636611252e-05, + "loss": 0.7075, + "step": 7526, + "task_loss": 0.9849943518638611 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7025734186172485, + "epoch": 6.36, + "learning_rate": 2.020757020757021e-05, + "loss": 0.6634, + "step": 7527, + "task_loss": 0.1741601675748825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8958563804626465, + "epoch": 6.36, + "learning_rate": 2.0202874049027895e-05, + "loss": 0.6974, + "step": 7528, + "task_loss": 0.6891791820526123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3044697046279907, + "epoch": 6.36, + "learning_rate": 2.0198177890485585e-05, + "loss": 0.8575, + "step": 7529, + "task_loss": 1.4352550506591797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0483877658843994, + "epoch": 6.36, + "learning_rate": 2.019348173194327e-05, + "loss": 0.8674, + "step": 7530, + "task_loss": 0.4578039050102234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5349478125572205, + "epoch": 6.37, + "learning_rate": 2.018878557340096e-05, + "loss": 0.5485, + "step": 7531, + "task_loss": 0.18011540174484253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40835249423980713, + "epoch": 6.37, + "learning_rate": 2.0184089414858647e-05, + "loss": 0.7038, + "step": 7532, + "task_loss": 0.5932508111000061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.719973087310791, + "epoch": 6.37, + "learning_rate": 2.0179393256316333e-05, + "loss": 0.7931, + "step": 7533, + "task_loss": 0.6570042967796326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7316364645957947, + "epoch": 6.37, + "learning_rate": 2.0174697097774023e-05, + "loss": 0.5838, + "step": 7534, + "task_loss": 0.3978091776371002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8342056274414062, + "epoch": 6.37, + "learning_rate": 2.017000093923171e-05, + "loss": 0.8349, + "step": 7535, + "task_loss": 1.0953102111816406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0915167331695557, + "epoch": 6.37, + "learning_rate": 2.01653047806894e-05, + "loss": 0.9528, + "step": 7536, + "task_loss": 0.5630344152450562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5937149524688721, + "epoch": 6.37, + "learning_rate": 2.0160608622147085e-05, + "loss": 0.7136, + "step": 7537, + "task_loss": 0.1944558173418045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9827777147293091, + "epoch": 6.37, + "learning_rate": 2.0155912463604772e-05, + "loss": 0.8215, + "step": 7538, + "task_loss": 1.0903899669647217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0574376583099365, + "epoch": 6.37, + "learning_rate": 2.0151216305062458e-05, + "loss": 0.9948, + "step": 7539, + "task_loss": 1.515382170677185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7983678579330444, + "epoch": 6.37, + "learning_rate": 2.0146520146520148e-05, + "loss": 0.9417, + "step": 7540, + "task_loss": 0.48026594519615173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4345841407775879, + "epoch": 6.37, + "learning_rate": 2.0141823987977834e-05, + "loss": 0.5429, + "step": 7541, + "task_loss": 0.822762668132782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4187994599342346, + "epoch": 6.38, + "learning_rate": 2.0137127829435524e-05, + "loss": 0.5927, + "step": 7542, + "task_loss": 0.12753166258335114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6323968172073364, + "epoch": 6.38, + "learning_rate": 2.013243167089321e-05, + "loss": 0.7228, + "step": 7543, + "task_loss": 0.6080396175384521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8404750227928162, + "epoch": 6.38, + "learning_rate": 2.0127735512350897e-05, + "loss": 0.7509, + "step": 7544, + "task_loss": 0.5601015686988831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5631935596466064, + "epoch": 6.38, + "learning_rate": 2.0123039353808583e-05, + "loss": 0.939, + "step": 7545, + "task_loss": 0.7316991090774536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6040771007537842, + "epoch": 6.38, + "learning_rate": 2.0118343195266273e-05, + "loss": 0.6489, + "step": 7546, + "task_loss": 0.3803209066390991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4479072093963623, + "epoch": 6.38, + "learning_rate": 2.0113647036723962e-05, + "loss": 0.6534, + "step": 7547, + "task_loss": 1.0161069631576538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7445828318595886, + "epoch": 6.38, + "learning_rate": 2.010895087818165e-05, + "loss": 0.6282, + "step": 7548, + "task_loss": 0.8260242938995361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6225458383560181, + "epoch": 6.38, + "learning_rate": 2.010425471963934e-05, + "loss": 0.7293, + "step": 7549, + "task_loss": 0.7846152186393738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7853468060493469, + "epoch": 6.38, + "learning_rate": 2.0099558561097025e-05, + "loss": 0.5, + "step": 7550, + "task_loss": 0.36259153485298157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2956175804138184, + "epoch": 6.38, + "learning_rate": 2.009486240255471e-05, + "loss": 0.9392, + "step": 7551, + "task_loss": 1.1912652254104614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5296987295150757, + "epoch": 6.38, + "learning_rate": 2.0090166244012397e-05, + "loss": 0.5891, + "step": 7552, + "task_loss": 0.8086316585540771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41401636600494385, + "epoch": 6.38, + "learning_rate": 2.0085470085470087e-05, + "loss": 0.6248, + "step": 7553, + "task_loss": 0.395498126745224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.548647403717041, + "epoch": 6.39, + "learning_rate": 2.0080773926927774e-05, + "loss": 0.7016, + "step": 7554, + "task_loss": 1.0272201299667358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46696850657463074, + "epoch": 6.39, + "learning_rate": 2.0076077768385463e-05, + "loss": 0.7131, + "step": 7555, + "task_loss": 0.6116682291030884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7245603799819946, + "epoch": 6.39, + "learning_rate": 2.007138160984315e-05, + "loss": 0.7812, + "step": 7556, + "task_loss": 0.7864206433296204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6072379350662231, + "epoch": 6.39, + "learning_rate": 2.0066685451300836e-05, + "loss": 0.831, + "step": 7557, + "task_loss": 0.7088528871536255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7353689074516296, + "epoch": 6.39, + "learning_rate": 2.0061989292758522e-05, + "loss": 0.8521, + "step": 7558, + "task_loss": 0.9674023389816284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4925864636898041, + "epoch": 6.39, + "learning_rate": 2.0057293134216212e-05, + "loss": 0.7468, + "step": 7559, + "task_loss": 0.5418189764022827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3976247310638428, + "epoch": 6.39, + "learning_rate": 2.00525969756739e-05, + "loss": 0.8953, + "step": 7560, + "task_loss": 1.0477639436721802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0549635887145996, + "epoch": 6.39, + "learning_rate": 2.0047900817131588e-05, + "loss": 0.8228, + "step": 7561, + "task_loss": 1.692091464996338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6518211960792542, + "epoch": 6.39, + "learning_rate": 2.0043204658589278e-05, + "loss": 0.5542, + "step": 7562, + "task_loss": 0.5194807648658752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7623116374015808, + "epoch": 6.39, + "learning_rate": 2.003850850004696e-05, + "loss": 0.9223, + "step": 7563, + "task_loss": 0.39547768235206604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9316878318786621, + "epoch": 6.39, + "learning_rate": 2.003381234150465e-05, + "loss": 0.7812, + "step": 7564, + "task_loss": 1.363194227218628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6729598641395569, + "epoch": 6.39, + "learning_rate": 2.0029116182962337e-05, + "loss": 0.7754, + "step": 7565, + "task_loss": 0.9899405837059021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9512329697608948, + "epoch": 6.4, + "learning_rate": 2.0024420024420027e-05, + "loss": 0.8025, + "step": 7566, + "task_loss": 1.0848573446273804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9794129133224487, + "epoch": 6.4, + "learning_rate": 2.0019723865877713e-05, + "loss": 0.799, + "step": 7567, + "task_loss": 0.671390950679779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.792204737663269, + "epoch": 6.4, + "learning_rate": 2.0015027707335403e-05, + "loss": 0.7902, + "step": 7568, + "task_loss": 0.6581438779830933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9649010896682739, + "epoch": 6.4, + "learning_rate": 2.001033154879309e-05, + "loss": 1.3135, + "step": 7569, + "task_loss": 0.9912058711051941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5095652937889099, + "epoch": 6.4, + "learning_rate": 2.0005635390250775e-05, + "loss": 0.7887, + "step": 7570, + "task_loss": 0.6384296417236328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7252198457717896, + "epoch": 6.4, + "learning_rate": 2.000093923170846e-05, + "loss": 0.8093, + "step": 7571, + "task_loss": 0.7928059101104736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8304101228713989, + "epoch": 6.4, + "learning_rate": 1.999624307316615e-05, + "loss": 0.8171, + "step": 7572, + "task_loss": 0.8555334806442261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39439857006073, + "epoch": 6.4, + "learning_rate": 1.9991546914623838e-05, + "loss": 0.6802, + "step": 7573, + "task_loss": 0.6421467065811157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9881793856620789, + "epoch": 6.4, + "learning_rate": 1.9986850756081527e-05, + "loss": 0.8473, + "step": 7574, + "task_loss": 0.7989943027496338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1185126304626465, + "epoch": 6.4, + "learning_rate": 1.9982154597539214e-05, + "loss": 0.8876, + "step": 7575, + "task_loss": 1.5409964323043823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4071910083293915, + "epoch": 6.4, + "learning_rate": 1.99774584389969e-05, + "loss": 0.7298, + "step": 7576, + "task_loss": 0.3754138648509979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45496195554733276, + "epoch": 6.4, + "learning_rate": 1.9972762280454586e-05, + "loss": 0.8587, + "step": 7577, + "task_loss": 0.5419254302978516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4706938862800598, + "epoch": 6.41, + "learning_rate": 1.9968066121912276e-05, + "loss": 0.6613, + "step": 7578, + "task_loss": 0.9052693843841553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6920653581619263, + "epoch": 6.41, + "learning_rate": 1.9963369963369966e-05, + "loss": 0.6871, + "step": 7579, + "task_loss": 0.6808609962463379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0918956995010376, + "epoch": 6.41, + "learning_rate": 1.9958673804827652e-05, + "loss": 0.7848, + "step": 7580, + "task_loss": 1.9416871070861816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9899260401725769, + "epoch": 6.41, + "learning_rate": 1.9953977646285342e-05, + "loss": 0.7407, + "step": 7581, + "task_loss": 0.7347291707992554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35924428701400757, + "epoch": 6.41, + "learning_rate": 1.994928148774303e-05, + "loss": 0.5293, + "step": 7582, + "task_loss": 0.49180716276168823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9731068015098572, + "epoch": 6.41, + "learning_rate": 1.9944585329200715e-05, + "loss": 0.6994, + "step": 7583, + "task_loss": 0.7984768748283386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.800491213798523, + "epoch": 6.41, + "learning_rate": 1.99398891706584e-05, + "loss": 0.778, + "step": 7584, + "task_loss": 0.5808577537536621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2880233526229858, + "epoch": 6.41, + "learning_rate": 1.993519301211609e-05, + "loss": 0.7864, + "step": 7585, + "task_loss": 1.692447304725647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6164824962615967, + "epoch": 6.41, + "learning_rate": 1.9930496853573777e-05, + "loss": 0.7138, + "step": 7586, + "task_loss": 0.7234996557235718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6692668199539185, + "epoch": 6.41, + "learning_rate": 1.9925800695031467e-05, + "loss": 0.6475, + "step": 7587, + "task_loss": 1.4803858995437622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6115835905075073, + "epoch": 6.41, + "learning_rate": 1.9921104536489153e-05, + "loss": 0.65, + "step": 7588, + "task_loss": 1.0782718658447266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9879306554794312, + "epoch": 6.41, + "learning_rate": 1.991640837794684e-05, + "loss": 0.8429, + "step": 7589, + "task_loss": 1.0999752283096313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.014695167541504, + "epoch": 6.42, + "learning_rate": 1.9911712219404526e-05, + "loss": 0.733, + "step": 7590, + "task_loss": 1.1705737113952637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6471611261367798, + "epoch": 6.42, + "learning_rate": 1.9907016060862216e-05, + "loss": 0.7367, + "step": 7591, + "task_loss": 1.296729564666748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.812690019607544, + "epoch": 6.42, + "learning_rate": 1.9902319902319902e-05, + "loss": 0.9234, + "step": 7592, + "task_loss": 0.6287636160850525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5414273738861084, + "epoch": 6.42, + "learning_rate": 1.989762374377759e-05, + "loss": 0.6624, + "step": 7593, + "task_loss": 0.4580233097076416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9044414758682251, + "epoch": 6.42, + "learning_rate": 1.989292758523528e-05, + "loss": 0.8567, + "step": 7594, + "task_loss": 1.8658394813537598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37194591760635376, + "epoch": 6.42, + "learning_rate": 1.9888231426692964e-05, + "loss": 0.5345, + "step": 7595, + "task_loss": 1.337389588356018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5872178673744202, + "epoch": 6.42, + "learning_rate": 1.9883535268150654e-05, + "loss": 0.8213, + "step": 7596, + "task_loss": 0.6628521680831909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.927291989326477, + "epoch": 6.42, + "learning_rate": 1.987883910960834e-05, + "loss": 0.9239, + "step": 7597, + "task_loss": 1.6592340469360352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3163343071937561, + "epoch": 6.42, + "learning_rate": 1.987414295106603e-05, + "loss": 0.6493, + "step": 7598, + "task_loss": 0.39621442556381226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7799478769302368, + "epoch": 6.42, + "learning_rate": 1.9869446792523716e-05, + "loss": 0.6848, + "step": 7599, + "task_loss": 1.5239591598510742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0257673263549805, + "epoch": 6.42, + "learning_rate": 1.9864750633981406e-05, + "loss": 0.8307, + "step": 7600, + "task_loss": 1.2594103813171387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0639734268188477, + "epoch": 6.42, + "learning_rate": 1.9860054475439093e-05, + "loss": 0.5799, + "step": 7601, + "task_loss": 2.3172764778137207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6554515361785889, + "epoch": 6.43, + "learning_rate": 1.985535831689678e-05, + "loss": 0.5763, + "step": 7602, + "task_loss": 0.5721491575241089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7856898307800293, + "epoch": 6.43, + "learning_rate": 1.9850662158354465e-05, + "loss": 0.8173, + "step": 7603, + "task_loss": 0.38338908553123474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5960242748260498, + "epoch": 6.43, + "learning_rate": 1.9845965999812155e-05, + "loss": 0.7229, + "step": 7604, + "task_loss": 0.5853872299194336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.019831657409668, + "epoch": 6.43, + "learning_rate": 1.984126984126984e-05, + "loss": 0.8199, + "step": 7605, + "task_loss": 0.946938693523407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5050774216651917, + "epoch": 6.43, + "learning_rate": 1.983657368272753e-05, + "loss": 0.5934, + "step": 7606, + "task_loss": 0.7784441113471985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8492951393127441, + "epoch": 6.43, + "learning_rate": 1.9831877524185217e-05, + "loss": 0.6883, + "step": 7607, + "task_loss": 1.4425342082977295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9901495575904846, + "epoch": 6.43, + "learning_rate": 1.9827181365642904e-05, + "loss": 0.8387, + "step": 7608, + "task_loss": 0.914306104183197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9025952219963074, + "epoch": 6.43, + "learning_rate": 1.9822485207100593e-05, + "loss": 0.6986, + "step": 7609, + "task_loss": 1.5781766176223755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3488155603408813, + "epoch": 6.43, + "learning_rate": 1.981778904855828e-05, + "loss": 0.7673, + "step": 7610, + "task_loss": 0.9815776348114014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8924047946929932, + "epoch": 6.43, + "learning_rate": 1.981309289001597e-05, + "loss": 0.7199, + "step": 7611, + "task_loss": 1.5721698999404907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5159244537353516, + "epoch": 6.43, + "learning_rate": 1.9808396731473656e-05, + "loss": 0.705, + "step": 7612, + "task_loss": 0.8294304013252258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.601977527141571, + "epoch": 6.44, + "learning_rate": 1.9803700572931346e-05, + "loss": 0.6346, + "step": 7613, + "task_loss": 0.450419545173645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9583681225776672, + "epoch": 6.44, + "learning_rate": 1.9799004414389032e-05, + "loss": 0.8706, + "step": 7614, + "task_loss": 0.9907078742980957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0647671222686768, + "epoch": 6.44, + "learning_rate": 1.9794308255846718e-05, + "loss": 0.9387, + "step": 7615, + "task_loss": 0.9203618764877319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7498989701271057, + "epoch": 6.44, + "learning_rate": 1.9789612097304405e-05, + "loss": 0.8229, + "step": 7616, + "task_loss": 1.3346717357635498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46873050928115845, + "epoch": 6.44, + "learning_rate": 1.9784915938762094e-05, + "loss": 0.6441, + "step": 7617, + "task_loss": 0.6018404960632324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7259403467178345, + "epoch": 6.44, + "learning_rate": 1.978021978021978e-05, + "loss": 0.762, + "step": 7618, + "task_loss": 0.9145200252532959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6513059139251709, + "epoch": 6.44, + "learning_rate": 1.977552362167747e-05, + "loss": 0.742, + "step": 7619, + "task_loss": 1.0814377069473267 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7161652445793152, + "epoch": 6.44, + "learning_rate": 1.9770827463135157e-05, + "loss": 0.783, + "step": 7620, + "task_loss": 1.3463190793991089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1232523918151855, + "epoch": 6.44, + "learning_rate": 1.9766131304592843e-05, + "loss": 0.6817, + "step": 7621, + "task_loss": 0.5128882527351379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6046174168586731, + "epoch": 6.44, + "learning_rate": 1.976143514605053e-05, + "loss": 0.7081, + "step": 7622, + "task_loss": 1.371252417564392 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4807848036289215, + "epoch": 6.44, + "learning_rate": 1.975673898750822e-05, + "loss": 0.5205, + "step": 7623, + "task_loss": 1.3594398498535156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7682082653045654, + "epoch": 6.44, + "learning_rate": 1.975204282896591e-05, + "loss": 0.5946, + "step": 7624, + "task_loss": 0.5053848028182983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6603572368621826, + "epoch": 6.45, + "learning_rate": 1.9747346670423595e-05, + "loss": 0.9033, + "step": 7625, + "task_loss": 0.6079197525978088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7668554186820984, + "epoch": 6.45, + "learning_rate": 1.9742650511881285e-05, + "loss": 0.7362, + "step": 7626, + "task_loss": 0.8268011808395386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7177993655204773, + "epoch": 6.45, + "learning_rate": 1.9737954353338968e-05, + "loss": 0.7783, + "step": 7627, + "task_loss": 1.0116909742355347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9775610566139221, + "epoch": 6.45, + "learning_rate": 1.9733258194796658e-05, + "loss": 0.8983, + "step": 7628, + "task_loss": 1.5301963090896606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.607850193977356, + "epoch": 6.45, + "learning_rate": 1.9728562036254344e-05, + "loss": 0.5596, + "step": 7629, + "task_loss": 0.6180531978607178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.704442024230957, + "epoch": 6.45, + "learning_rate": 1.9723865877712034e-05, + "loss": 0.8731, + "step": 7630, + "task_loss": 0.4971233010292053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3705311417579651, + "epoch": 6.45, + "learning_rate": 1.971916971916972e-05, + "loss": 0.5788, + "step": 7631, + "task_loss": 0.24547626078128815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5417773723602295, + "epoch": 6.45, + "learning_rate": 1.971447356062741e-05, + "loss": 0.7832, + "step": 7632, + "task_loss": 0.8369224071502686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8113960027694702, + "epoch": 6.45, + "learning_rate": 1.9709777402085096e-05, + "loss": 0.6169, + "step": 7633, + "task_loss": 0.3220120668411255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7441126704216003, + "epoch": 6.45, + "learning_rate": 1.9705081243542782e-05, + "loss": 0.6375, + "step": 7634, + "task_loss": 1.0451297760009766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8870585560798645, + "epoch": 6.45, + "learning_rate": 1.970038508500047e-05, + "loss": 0.9103, + "step": 7635, + "task_loss": 0.7072421312332153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7330875992774963, + "epoch": 6.45, + "learning_rate": 1.969568892645816e-05, + "loss": 0.6812, + "step": 7636, + "task_loss": 0.7836346626281738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4489344656467438, + "epoch": 6.46, + "learning_rate": 1.9690992767915845e-05, + "loss": 0.609, + "step": 7637, + "task_loss": 0.028671870008111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6549336314201355, + "epoch": 6.46, + "learning_rate": 1.9686296609373535e-05, + "loss": 0.7351, + "step": 7638, + "task_loss": 2.1135644912719727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1013147830963135, + "epoch": 6.46, + "learning_rate": 1.968160045083122e-05, + "loss": 1.0509, + "step": 7639, + "task_loss": 0.34713420271873474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41844791173934937, + "epoch": 6.46, + "learning_rate": 1.9676904292288907e-05, + "loss": 0.6898, + "step": 7640, + "task_loss": 1.0466957092285156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9395298957824707, + "epoch": 6.46, + "learning_rate": 1.9672208133746597e-05, + "loss": 0.8717, + "step": 7641, + "task_loss": 1.1734822988510132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1004016399383545, + "epoch": 6.46, + "learning_rate": 1.9667511975204283e-05, + "loss": 0.7955, + "step": 7642, + "task_loss": 0.5363582372665405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0975589752197266, + "epoch": 6.46, + "learning_rate": 1.9662815816661973e-05, + "loss": 0.9302, + "step": 7643, + "task_loss": 0.8820284605026245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4563593566417694, + "epoch": 6.46, + "learning_rate": 1.965811965811966e-05, + "loss": 0.5336, + "step": 7644, + "task_loss": 0.36678874492645264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6835901737213135, + "epoch": 6.46, + "learning_rate": 1.965342349957735e-05, + "loss": 0.6555, + "step": 7645, + "task_loss": 0.8172290325164795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6804521083831787, + "epoch": 6.46, + "learning_rate": 1.9648727341035032e-05, + "loss": 0.833, + "step": 7646, + "task_loss": 0.6604913473129272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6461050510406494, + "epoch": 6.46, + "learning_rate": 1.9644031182492722e-05, + "loss": 0.5992, + "step": 7647, + "task_loss": 0.25746679306030273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3549998700618744, + "epoch": 6.46, + "learning_rate": 1.9639335023950408e-05, + "loss": 0.4828, + "step": 7648, + "task_loss": 0.084767185151577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8378896713256836, + "epoch": 6.47, + "learning_rate": 1.9634638865408098e-05, + "loss": 0.6356, + "step": 7649, + "task_loss": 0.36759141087532043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5671383142471313, + "epoch": 6.47, + "learning_rate": 1.9629942706865784e-05, + "loss": 0.6062, + "step": 7650, + "task_loss": 0.531984269618988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.536716639995575, + "epoch": 6.47, + "learning_rate": 1.9625246548323474e-05, + "loss": 0.6329, + "step": 7651, + "task_loss": 0.46266689896583557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9520303010940552, + "epoch": 6.47, + "learning_rate": 1.962055038978116e-05, + "loss": 0.898, + "step": 7652, + "task_loss": 1.0796414613723755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38990938663482666, + "epoch": 6.47, + "learning_rate": 1.9615854231238847e-05, + "loss": 0.6717, + "step": 7653, + "task_loss": 0.1859576255083084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6438318490982056, + "epoch": 6.47, + "learning_rate": 1.9611158072696533e-05, + "loss": 0.8422, + "step": 7654, + "task_loss": 0.39792540669441223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.946978747844696, + "epoch": 6.47, + "learning_rate": 1.9606461914154223e-05, + "loss": 0.7002, + "step": 7655, + "task_loss": 1.1868681907653809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7481412887573242, + "epoch": 6.47, + "learning_rate": 1.9601765755611912e-05, + "loss": 0.5171, + "step": 7656, + "task_loss": 1.3572677373886108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5795758962631226, + "epoch": 6.47, + "learning_rate": 1.95970695970696e-05, + "loss": 0.7549, + "step": 7657, + "task_loss": 0.6179518699645996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1502937078475952, + "epoch": 6.47, + "learning_rate": 1.9592373438527285e-05, + "loss": 0.8883, + "step": 7658, + "task_loss": 1.883871078491211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6170938611030579, + "epoch": 6.47, + "learning_rate": 1.958767727998497e-05, + "loss": 0.5802, + "step": 7659, + "task_loss": 0.8290494084358215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43443092703819275, + "epoch": 6.47, + "learning_rate": 1.958298112144266e-05, + "loss": 0.7217, + "step": 7660, + "task_loss": 0.45499610900878906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4020749032497406, + "epoch": 6.48, + "learning_rate": 1.9578284962900347e-05, + "loss": 0.5663, + "step": 7661, + "task_loss": 0.6695215106010437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5470622181892395, + "epoch": 6.48, + "learning_rate": 1.9573588804358037e-05, + "loss": 0.6557, + "step": 7662, + "task_loss": 0.6010783910751343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6080908179283142, + "epoch": 6.48, + "learning_rate": 1.9568892645815723e-05, + "loss": 0.5875, + "step": 7663, + "task_loss": 0.3884897232055664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0029715299606323, + "epoch": 6.48, + "learning_rate": 1.9564196487273413e-05, + "loss": 0.9147, + "step": 7664, + "task_loss": 1.9032182693481445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9333693981170654, + "epoch": 6.48, + "learning_rate": 1.95595003287311e-05, + "loss": 0.8198, + "step": 7665, + "task_loss": 1.3790310621261597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0713704824447632, + "epoch": 6.48, + "learning_rate": 1.9554804170188786e-05, + "loss": 0.7448, + "step": 7666, + "task_loss": 0.8557037711143494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5958260297775269, + "epoch": 6.48, + "learning_rate": 1.9550108011646472e-05, + "loss": 0.5718, + "step": 7667, + "task_loss": 0.33300381898880005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5017598271369934, + "epoch": 6.48, + "learning_rate": 1.9545411853104162e-05, + "loss": 0.5906, + "step": 7668, + "task_loss": 0.6519893407821655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36946403980255127, + "epoch": 6.48, + "learning_rate": 1.954071569456185e-05, + "loss": 0.5557, + "step": 7669, + "task_loss": 0.36470916867256165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6212729215621948, + "epoch": 6.48, + "learning_rate": 1.9536019536019538e-05, + "loss": 0.7272, + "step": 7670, + "task_loss": 0.8073986172676086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8842863440513611, + "epoch": 6.48, + "learning_rate": 1.9531323377477224e-05, + "loss": 1.0034, + "step": 7671, + "task_loss": 0.7476460933685303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5323085784912109, + "epoch": 6.48, + "learning_rate": 1.952662721893491e-05, + "loss": 0.5975, + "step": 7672, + "task_loss": 1.1198843717575073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5339580774307251, + "epoch": 6.49, + "learning_rate": 1.95219310603926e-05, + "loss": 0.5262, + "step": 7673, + "task_loss": 0.4230917990207672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.752091109752655, + "epoch": 6.49, + "learning_rate": 1.9517234901850287e-05, + "loss": 0.7387, + "step": 7674, + "task_loss": 0.4099613428115845 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8076943159103394, + "epoch": 6.49, + "learning_rate": 1.9512538743307977e-05, + "loss": 0.7939, + "step": 7675, + "task_loss": 0.9151133894920349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6077210307121277, + "epoch": 6.49, + "learning_rate": 1.9507842584765663e-05, + "loss": 0.6888, + "step": 7676, + "task_loss": 0.46267110109329224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9182029962539673, + "epoch": 6.49, + "learning_rate": 1.9503146426223353e-05, + "loss": 0.7513, + "step": 7677, + "task_loss": 0.6592177748680115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9368127584457397, + "epoch": 6.49, + "learning_rate": 1.9498450267681036e-05, + "loss": 0.8257, + "step": 7678, + "task_loss": 0.7744563221931458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.040683388710022, + "epoch": 6.49, + "learning_rate": 1.9493754109138725e-05, + "loss": 0.9294, + "step": 7679, + "task_loss": 1.0319582223892212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7262957096099854, + "epoch": 6.49, + "learning_rate": 1.948905795059641e-05, + "loss": 1.02, + "step": 7680, + "task_loss": 1.0110260248184204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3978745937347412, + "epoch": 6.49, + "learning_rate": 1.94843617920541e-05, + "loss": 0.631, + "step": 7681, + "task_loss": 0.45489755272865295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6194429993629456, + "epoch": 6.49, + "learning_rate": 1.9479665633511788e-05, + "loss": 0.7523, + "step": 7682, + "task_loss": 0.9386467933654785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6541781425476074, + "epoch": 6.49, + "learning_rate": 1.9474969474969477e-05, + "loss": 0.7375, + "step": 7683, + "task_loss": 0.14900720119476318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7662619948387146, + "epoch": 6.5, + "learning_rate": 1.9470273316427164e-05, + "loss": 0.7945, + "step": 7684, + "task_loss": 1.5367803573608398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.974721372127533, + "epoch": 6.5, + "learning_rate": 1.946557715788485e-05, + "loss": 0.7873, + "step": 7685, + "task_loss": 1.3970459699630737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.744037389755249, + "epoch": 6.5, + "learning_rate": 1.946088099934254e-05, + "loss": 0.7389, + "step": 7686, + "task_loss": 0.7892038822174072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5587062835693359, + "epoch": 6.5, + "learning_rate": 1.9456184840800226e-05, + "loss": 0.6255, + "step": 7687, + "task_loss": 0.8366269469261169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7333464622497559, + "epoch": 6.5, + "learning_rate": 1.9451488682257916e-05, + "loss": 0.657, + "step": 7688, + "task_loss": 0.7022040486335754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5175093412399292, + "epoch": 6.5, + "learning_rate": 1.9446792523715602e-05, + "loss": 0.5341, + "step": 7689, + "task_loss": 0.993614673614502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4450823068618774, + "epoch": 6.5, + "learning_rate": 1.944209636517329e-05, + "loss": 0.865, + "step": 7690, + "task_loss": 1.473414421081543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2266720533370972, + "epoch": 6.5, + "learning_rate": 1.9437400206630975e-05, + "loss": 0.797, + "step": 7691, + "task_loss": 1.6193474531173706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39499473571777344, + "epoch": 6.5, + "learning_rate": 1.9432704048088665e-05, + "loss": 0.7253, + "step": 7692, + "task_loss": 0.14277000725269318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8442603349685669, + "epoch": 6.5, + "learning_rate": 1.942800788954635e-05, + "loss": 0.7514, + "step": 7693, + "task_loss": 0.5949330925941467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6672244071960449, + "epoch": 6.5, + "learning_rate": 1.942331173100404e-05, + "loss": 0.733, + "step": 7694, + "task_loss": 0.6044737100601196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3811165988445282, + "epoch": 6.5, + "learning_rate": 1.9418615572461727e-05, + "loss": 0.6609, + "step": 7695, + "task_loss": 0.19134503602981567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9168832898139954, + "epoch": 6.51, + "learning_rate": 1.9413919413919417e-05, + "loss": 0.8501, + "step": 7696, + "task_loss": 1.3544936180114746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8563758134841919, + "epoch": 6.51, + "learning_rate": 1.9409223255377103e-05, + "loss": 0.7715, + "step": 7697, + "task_loss": 1.2832733392715454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8305966258049011, + "epoch": 6.51, + "learning_rate": 1.940452709683479e-05, + "loss": 0.6625, + "step": 7698, + "task_loss": 1.1045334339141846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3913740813732147, + "epoch": 6.51, + "learning_rate": 1.9399830938292476e-05, + "loss": 0.6396, + "step": 7699, + "task_loss": 0.26936590671539307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5535858273506165, + "epoch": 6.51, + "learning_rate": 1.9395134779750165e-05, + "loss": 0.6665, + "step": 7700, + "task_loss": 0.49178457260131836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7849540114402771, + "epoch": 6.51, + "learning_rate": 1.9390438621207855e-05, + "loss": 0.9252, + "step": 7701, + "task_loss": 1.8992985486984253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8047544956207275, + "epoch": 6.51, + "learning_rate": 1.938574246266554e-05, + "loss": 0.7258, + "step": 7702, + "task_loss": 0.8615409135818481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5726311206817627, + "epoch": 6.51, + "learning_rate": 1.9381046304123228e-05, + "loss": 0.804, + "step": 7703, + "task_loss": 0.5708454251289368 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7503408193588257, + "epoch": 6.51, + "learning_rate": 1.9376350145580914e-05, + "loss": 0.8292, + "step": 7704, + "task_loss": 0.9339148998260498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7210506200790405, + "epoch": 6.51, + "learning_rate": 1.9371653987038604e-05, + "loss": 0.6455, + "step": 7705, + "task_loss": 0.7204990983009338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8715714812278748, + "epoch": 6.51, + "learning_rate": 1.936695782849629e-05, + "loss": 0.6506, + "step": 7706, + "task_loss": 0.176946759223938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5305806398391724, + "epoch": 6.51, + "learning_rate": 1.936226166995398e-05, + "loss": 0.5084, + "step": 7707, + "task_loss": 0.6959893107414246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8788647055625916, + "epoch": 6.52, + "learning_rate": 1.9357565511411666e-05, + "loss": 0.819, + "step": 7708, + "task_loss": 1.4055845737457275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0391018390655518, + "epoch": 6.52, + "learning_rate": 1.9352869352869356e-05, + "loss": 0.7539, + "step": 7709, + "task_loss": 0.9696717262268066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7876948118209839, + "epoch": 6.52, + "learning_rate": 1.934817319432704e-05, + "loss": 0.7556, + "step": 7710, + "task_loss": 1.4222209453582764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0485601425170898, + "epoch": 6.52, + "learning_rate": 1.934347703578473e-05, + "loss": 1.0434, + "step": 7711, + "task_loss": 0.38439807295799255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6688847541809082, + "epoch": 6.52, + "learning_rate": 1.9338780877242415e-05, + "loss": 0.8281, + "step": 7712, + "task_loss": 0.9486384391784668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5378887057304382, + "epoch": 6.52, + "learning_rate": 1.9334084718700105e-05, + "loss": 0.6378, + "step": 7713, + "task_loss": 0.5738407969474792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0102933645248413, + "epoch": 6.52, + "learning_rate": 1.932938856015779e-05, + "loss": 0.7185, + "step": 7714, + "task_loss": 0.6165779232978821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8767176866531372, + "epoch": 6.52, + "learning_rate": 1.932469240161548e-05, + "loss": 0.8425, + "step": 7715, + "task_loss": 0.9264959692955017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7541339993476868, + "epoch": 6.52, + "learning_rate": 1.9319996243073167e-05, + "loss": 0.6826, + "step": 7716, + "task_loss": 0.9786801934242249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3892976939678192, + "epoch": 6.52, + "learning_rate": 1.9315300084530854e-05, + "loss": 0.6385, + "step": 7717, + "task_loss": 0.15411044657230377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6396268606185913, + "epoch": 6.52, + "learning_rate": 1.9310603925988543e-05, + "loss": 0.9173, + "step": 7718, + "task_loss": 0.6852754354476929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9400156140327454, + "epoch": 6.52, + "learning_rate": 1.930590776744623e-05, + "loss": 0.8959, + "step": 7719, + "task_loss": 1.1729614734649658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6031107902526855, + "epoch": 6.53, + "learning_rate": 1.930121160890392e-05, + "loss": 0.6377, + "step": 7720, + "task_loss": 0.9545750617980957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7786155939102173, + "epoch": 6.53, + "learning_rate": 1.9296515450361606e-05, + "loss": 0.8014, + "step": 7721, + "task_loss": 0.841705322265625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2996140718460083, + "epoch": 6.53, + "learning_rate": 1.9291819291819292e-05, + "loss": 1.2648, + "step": 7722, + "task_loss": 1.480513334274292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5905527472496033, + "epoch": 6.53, + "learning_rate": 1.928712313327698e-05, + "loss": 0.5655, + "step": 7723, + "task_loss": 0.2887874245643616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5376182794570923, + "epoch": 6.53, + "learning_rate": 1.9282426974734668e-05, + "loss": 0.7498, + "step": 7724, + "task_loss": 0.644962728023529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5927246809005737, + "epoch": 6.53, + "learning_rate": 1.9277730816192354e-05, + "loss": 0.7216, + "step": 7725, + "task_loss": 0.8949723839759827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9754860401153564, + "epoch": 6.53, + "learning_rate": 1.9273034657650044e-05, + "loss": 0.6375, + "step": 7726, + "task_loss": 0.4873080253601074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8372031450271606, + "epoch": 6.53, + "learning_rate": 1.926833849910773e-05, + "loss": 0.751, + "step": 7727, + "task_loss": 0.8517493605613708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5863825678825378, + "epoch": 6.53, + "learning_rate": 1.926364234056542e-05, + "loss": 0.6045, + "step": 7728, + "task_loss": 0.3812902867794037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5398223400115967, + "epoch": 6.53, + "learning_rate": 1.9258946182023103e-05, + "loss": 0.6573, + "step": 7729, + "task_loss": 1.1849589347839355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6727845072746277, + "epoch": 6.53, + "learning_rate": 1.9254250023480793e-05, + "loss": 0.6766, + "step": 7730, + "task_loss": 0.3228757083415985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8819069862365723, + "epoch": 6.53, + "learning_rate": 1.924955386493848e-05, + "loss": 0.7165, + "step": 7731, + "task_loss": 0.5985199213027954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4298640489578247, + "epoch": 6.54, + "learning_rate": 1.924485770639617e-05, + "loss": 0.6732, + "step": 7732, + "task_loss": 0.815597414970398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7423425912857056, + "epoch": 6.54, + "learning_rate": 1.924016154785386e-05, + "loss": 0.7915, + "step": 7733, + "task_loss": 0.6121283769607544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7003265023231506, + "epoch": 6.54, + "learning_rate": 1.9235465389311545e-05, + "loss": 0.6296, + "step": 7734, + "task_loss": 0.31271976232528687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5254868865013123, + "epoch": 6.54, + "learning_rate": 1.923076923076923e-05, + "loss": 0.7349, + "step": 7735, + "task_loss": 0.4855116009712219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43674591183662415, + "epoch": 6.54, + "learning_rate": 1.9226073072226918e-05, + "loss": 0.4792, + "step": 7736, + "task_loss": 0.9132821559906006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7891735434532166, + "epoch": 6.54, + "learning_rate": 1.9221376913684607e-05, + "loss": 0.6492, + "step": 7737, + "task_loss": 0.7477094531059265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2180988788604736, + "epoch": 6.54, + "learning_rate": 1.9216680755142294e-05, + "loss": 0.7319, + "step": 7738, + "task_loss": 0.8285298943519592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7939982414245605, + "epoch": 6.54, + "learning_rate": 1.9211984596599984e-05, + "loss": 0.6477, + "step": 7739, + "task_loss": 0.7834553718566895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6451586484909058, + "epoch": 6.54, + "learning_rate": 1.920728843805767e-05, + "loss": 0.6704, + "step": 7740, + "task_loss": 0.8189336657524109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8612432479858398, + "epoch": 6.54, + "learning_rate": 1.9202592279515356e-05, + "loss": 0.6677, + "step": 7741, + "task_loss": 1.3176567554473877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34695884585380554, + "epoch": 6.54, + "learning_rate": 1.9197896120973043e-05, + "loss": 0.5649, + "step": 7742, + "task_loss": 0.2803002595901489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3164924085140228, + "epoch": 6.54, + "learning_rate": 1.9193199962430732e-05, + "loss": 0.5076, + "step": 7743, + "task_loss": 0.2756063640117645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0373148918151855, + "epoch": 6.55, + "learning_rate": 1.918850380388842e-05, + "loss": 0.7185, + "step": 7744, + "task_loss": 0.4706798195838928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.391814261674881, + "epoch": 6.55, + "learning_rate": 1.918380764534611e-05, + "loss": 0.7298, + "step": 7745, + "task_loss": 0.3161371946334839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7492378950119019, + "epoch": 6.55, + "learning_rate": 1.9179111486803795e-05, + "loss": 0.9686, + "step": 7746, + "task_loss": 0.33143195509910583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2096893787384033, + "epoch": 6.55, + "learning_rate": 1.9174415328261484e-05, + "loss": 0.9531, + "step": 7747, + "task_loss": 1.294621229171753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5983145833015442, + "epoch": 6.55, + "learning_rate": 1.916971916971917e-05, + "loss": 0.8284, + "step": 7748, + "task_loss": 1.633280634880066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7714205384254456, + "epoch": 6.55, + "learning_rate": 1.9165023011176857e-05, + "loss": 0.82, + "step": 7749, + "task_loss": 0.4833499789237976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5647680163383484, + "epoch": 6.55, + "learning_rate": 1.9160326852634547e-05, + "loss": 0.63, + "step": 7750, + "task_loss": 1.170346975326538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4931541085243225, + "epoch": 6.55, + "learning_rate": 1.9155630694092233e-05, + "loss": 0.5985, + "step": 7751, + "task_loss": 0.4870527684688568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9296172857284546, + "epoch": 6.55, + "learning_rate": 1.9150934535549923e-05, + "loss": 0.7688, + "step": 7752, + "task_loss": 1.282664179801941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7493938207626343, + "epoch": 6.55, + "learning_rate": 1.914623837700761e-05, + "loss": 0.7958, + "step": 7753, + "task_loss": 1.1350146532058716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7067694664001465, + "epoch": 6.55, + "learning_rate": 1.9141542218465296e-05, + "loss": 0.7399, + "step": 7754, + "task_loss": 0.8203844428062439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9400995969772339, + "epoch": 6.56, + "learning_rate": 1.9136846059922982e-05, + "loss": 0.8157, + "step": 7755, + "task_loss": 0.8471134305000305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7401766180992126, + "epoch": 6.56, + "learning_rate": 1.913214990138067e-05, + "loss": 0.8162, + "step": 7756, + "task_loss": 1.1807582378387451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4688177704811096, + "epoch": 6.56, + "learning_rate": 1.9127453742838358e-05, + "loss": 0.793, + "step": 7757, + "task_loss": 0.921956479549408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5760269165039062, + "epoch": 6.56, + "learning_rate": 1.9122757584296048e-05, + "loss": 0.69, + "step": 7758, + "task_loss": 0.6832701563835144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45360803604125977, + "epoch": 6.56, + "learning_rate": 1.9118061425753734e-05, + "loss": 0.6073, + "step": 7759, + "task_loss": 0.12004715204238892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4780018925666809, + "epoch": 6.56, + "learning_rate": 1.9113365267211424e-05, + "loss": 0.85, + "step": 7760, + "task_loss": 0.6116994619369507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5920121073722839, + "epoch": 6.56, + "learning_rate": 1.9108669108669107e-05, + "loss": 0.77, + "step": 7761, + "task_loss": 0.8503890633583069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6518872976303101, + "epoch": 6.56, + "learning_rate": 1.9103972950126796e-05, + "loss": 0.576, + "step": 7762, + "task_loss": 0.5870639681816101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8605935573577881, + "epoch": 6.56, + "learning_rate": 1.9099276791584486e-05, + "loss": 0.7268, + "step": 7763, + "task_loss": 1.2809977531433105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5789771676063538, + "epoch": 6.56, + "learning_rate": 1.9094580633042173e-05, + "loss": 0.587, + "step": 7764, + "task_loss": 0.8626643419265747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7379387021064758, + "epoch": 6.56, + "learning_rate": 1.9089884474499862e-05, + "loss": 0.9264, + "step": 7765, + "task_loss": 1.3871058225631714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.540118396282196, + "epoch": 6.56, + "learning_rate": 1.908518831595755e-05, + "loss": 0.7965, + "step": 7766, + "task_loss": 0.28565290570259094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2509283423423767, + "epoch": 6.57, + "learning_rate": 1.9080492157415235e-05, + "loss": 0.6232, + "step": 7767, + "task_loss": 0.2952711582183838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.854260265827179, + "epoch": 6.57, + "learning_rate": 1.907579599887292e-05, + "loss": 0.7189, + "step": 7768, + "task_loss": 0.3654499053955078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3097617030143738, + "epoch": 6.57, + "learning_rate": 1.907109984033061e-05, + "loss": 0.4707, + "step": 7769, + "task_loss": 0.02215125411748886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.827308177947998, + "epoch": 6.57, + "learning_rate": 1.9066403681788297e-05, + "loss": 0.7255, + "step": 7770, + "task_loss": 0.5571097135543823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9165446162223816, + "epoch": 6.57, + "learning_rate": 1.9061707523245987e-05, + "loss": 0.7839, + "step": 7771, + "task_loss": 0.8263823390007019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5890278816223145, + "epoch": 6.57, + "learning_rate": 1.9057011364703673e-05, + "loss": 0.583, + "step": 7772, + "task_loss": 0.9413571953773499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4701464176177979, + "epoch": 6.57, + "learning_rate": 1.905231520616136e-05, + "loss": 0.952, + "step": 7773, + "task_loss": 1.3535890579223633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5097281336784363, + "epoch": 6.57, + "learning_rate": 1.9047619047619046e-05, + "loss": 0.7686, + "step": 7774, + "task_loss": 0.38886770606040955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8099279999732971, + "epoch": 6.57, + "learning_rate": 1.9042922889076736e-05, + "loss": 0.6396, + "step": 7775, + "task_loss": 1.0725030899047852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6936069130897522, + "epoch": 6.57, + "learning_rate": 1.9038226730534422e-05, + "loss": 0.6865, + "step": 7776, + "task_loss": 0.539017915725708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8883570432662964, + "epoch": 6.57, + "learning_rate": 1.9033530571992112e-05, + "loss": 0.649, + "step": 7777, + "task_loss": 1.0920653343200684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4825749695301056, + "epoch": 6.57, + "learning_rate": 1.90288344134498e-05, + "loss": 0.6692, + "step": 7778, + "task_loss": 2.15434193611145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5092694759368896, + "epoch": 6.58, + "learning_rate": 1.9024138254907488e-05, + "loss": 0.6297, + "step": 7779, + "task_loss": 0.6431339383125305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8136820197105408, + "epoch": 6.58, + "learning_rate": 1.9019442096365174e-05, + "loss": 0.597, + "step": 7780, + "task_loss": 0.6277098059654236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5907455086708069, + "epoch": 6.58, + "learning_rate": 1.901474593782286e-05, + "loss": 0.7354, + "step": 7781, + "task_loss": 0.5234063267707825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46702224016189575, + "epoch": 6.58, + "learning_rate": 1.901004977928055e-05, + "loss": 0.7372, + "step": 7782, + "task_loss": 0.35114461183547974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6381715536117554, + "epoch": 6.58, + "learning_rate": 1.9005353620738237e-05, + "loss": 0.5827, + "step": 7783, + "task_loss": 1.1956830024719238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8229279518127441, + "epoch": 6.58, + "learning_rate": 1.9000657462195926e-05, + "loss": 0.899, + "step": 7784, + "task_loss": 1.060242772102356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5737509727478027, + "epoch": 6.58, + "learning_rate": 1.8995961303653613e-05, + "loss": 0.4824, + "step": 7785, + "task_loss": 0.5266300439834595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6548047065734863, + "epoch": 6.58, + "learning_rate": 1.89912651451113e-05, + "loss": 0.8374, + "step": 7786, + "task_loss": 0.404236763715744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0868611335754395, + "epoch": 6.58, + "learning_rate": 1.8986568986568985e-05, + "loss": 0.7494, + "step": 7787, + "task_loss": 1.570975661277771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8569380640983582, + "epoch": 6.58, + "learning_rate": 1.8981872828026675e-05, + "loss": 0.7416, + "step": 7788, + "task_loss": 1.113991618156433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7101219892501831, + "epoch": 6.58, + "learning_rate": 1.897717666948436e-05, + "loss": 0.5741, + "step": 7789, + "task_loss": 0.683138370513916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.89030522108078, + "epoch": 6.58, + "learning_rate": 1.897248051094205e-05, + "loss": 0.7974, + "step": 7790, + "task_loss": 0.7947936654090881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6993842124938965, + "epoch": 6.59, + "learning_rate": 1.8967784352399738e-05, + "loss": 0.7358, + "step": 7791, + "task_loss": 1.230448842048645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3188402652740479, + "epoch": 6.59, + "learning_rate": 1.8963088193857424e-05, + "loss": 0.8977, + "step": 7792, + "task_loss": 1.454193115234375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.057513952255249, + "epoch": 6.59, + "learning_rate": 1.895839203531511e-05, + "loss": 0.6674, + "step": 7793, + "task_loss": 0.6343326568603516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.806682825088501, + "epoch": 6.59, + "learning_rate": 1.89536958767728e-05, + "loss": 0.6579, + "step": 7794, + "task_loss": 0.5346019268035889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5299502611160278, + "epoch": 6.59, + "learning_rate": 1.894899971823049e-05, + "loss": 0.7228, + "step": 7795, + "task_loss": 0.6528446674346924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6716800928115845, + "epoch": 6.59, + "learning_rate": 1.8944303559688176e-05, + "loss": 0.667, + "step": 7796, + "task_loss": 0.8172768950462341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49140918254852295, + "epoch": 6.59, + "learning_rate": 1.8939607401145866e-05, + "loss": 0.5185, + "step": 7797, + "task_loss": 0.305518239736557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6331618428230286, + "epoch": 6.59, + "learning_rate": 1.8934911242603552e-05, + "loss": 0.6601, + "step": 7798, + "task_loss": 0.5063621401786804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3184168338775635, + "epoch": 6.59, + "learning_rate": 1.893021508406124e-05, + "loss": 0.677, + "step": 7799, + "task_loss": 0.48692333698272705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8533059358596802, + "epoch": 6.59, + "learning_rate": 1.8925518925518925e-05, + "loss": 0.7331, + "step": 7800, + "task_loss": 0.33693423867225647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.836164116859436, + "epoch": 6.59, + "learning_rate": 1.8920822766976615e-05, + "loss": 0.721, + "step": 7801, + "task_loss": 0.9489198923110962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7165403962135315, + "epoch": 6.59, + "learning_rate": 1.89161266084343e-05, + "loss": 0.8088, + "step": 7802, + "task_loss": 1.5076020956039429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8273138999938965, + "epoch": 6.6, + "learning_rate": 1.891143044989199e-05, + "loss": 0.7999, + "step": 7803, + "task_loss": 1.6975314617156982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7289086580276489, + "epoch": 6.6, + "learning_rate": 1.8906734291349677e-05, + "loss": 0.7288, + "step": 7804, + "task_loss": 0.8381226062774658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7704528570175171, + "epoch": 6.6, + "learning_rate": 1.8902038132807363e-05, + "loss": 0.9323, + "step": 7805, + "task_loss": 1.7940747737884521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.256895899772644, + "epoch": 6.6, + "learning_rate": 1.889734197426505e-05, + "loss": 0.9111, + "step": 7806, + "task_loss": 1.6797608137130737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7859238982200623, + "epoch": 6.6, + "learning_rate": 1.889264581572274e-05, + "loss": 0.7906, + "step": 7807, + "task_loss": 1.5870091915130615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0073221921920776, + "epoch": 6.6, + "learning_rate": 1.8887949657180426e-05, + "loss": 0.5878, + "step": 7808, + "task_loss": 0.6206783652305603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6592059135437012, + "epoch": 6.6, + "learning_rate": 1.8883253498638115e-05, + "loss": 0.5486, + "step": 7809, + "task_loss": 1.5339548587799072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2701926231384277, + "epoch": 6.6, + "learning_rate": 1.8878557340095805e-05, + "loss": 0.856, + "step": 7810, + "task_loss": 0.8669659495353699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7515615224838257, + "epoch": 6.6, + "learning_rate": 1.887386118155349e-05, + "loss": 0.72, + "step": 7811, + "task_loss": 1.1863491535186768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5042928457260132, + "epoch": 6.6, + "learning_rate": 1.8869165023011178e-05, + "loss": 0.6659, + "step": 7812, + "task_loss": 0.691389799118042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8114071488380432, + "epoch": 6.6, + "learning_rate": 1.8864468864468864e-05, + "loss": 0.7502, + "step": 7813, + "task_loss": 0.3721247911453247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38322579860687256, + "epoch": 6.6, + "learning_rate": 1.8859772705926554e-05, + "loss": 0.6483, + "step": 7814, + "task_loss": 0.061326153576374054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5933911204338074, + "epoch": 6.61, + "learning_rate": 1.885507654738424e-05, + "loss": 0.7209, + "step": 7815, + "task_loss": 1.241695523262024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5378559231758118, + "epoch": 6.61, + "learning_rate": 1.885038038884193e-05, + "loss": 0.6551, + "step": 7816, + "task_loss": 0.23212647438049316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.746422290802002, + "epoch": 6.61, + "learning_rate": 1.8845684230299616e-05, + "loss": 0.646, + "step": 7817, + "task_loss": 1.8283711671829224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47464901208877563, + "epoch": 6.61, + "learning_rate": 1.8840988071757303e-05, + "loss": 0.6529, + "step": 7818, + "task_loss": 0.46652907133102417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8120818138122559, + "epoch": 6.61, + "learning_rate": 1.883629191321499e-05, + "loss": 0.6, + "step": 7819, + "task_loss": 0.4492277204990387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6828403472900391, + "epoch": 6.61, + "learning_rate": 1.883159575467268e-05, + "loss": 0.6923, + "step": 7820, + "task_loss": 0.9340840578079224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7717525959014893, + "epoch": 6.61, + "learning_rate": 1.8826899596130365e-05, + "loss": 0.7275, + "step": 7821, + "task_loss": 0.6896119713783264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7674301266670227, + "epoch": 6.61, + "learning_rate": 1.8822203437588055e-05, + "loss": 0.7601, + "step": 7822, + "task_loss": 1.0302557945251465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6335337162017822, + "epoch": 6.61, + "learning_rate": 1.881750727904574e-05, + "loss": 0.7603, + "step": 7823, + "task_loss": 0.5713998675346375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8188178539276123, + "epoch": 6.61, + "learning_rate": 1.8812811120503427e-05, + "loss": 0.6082, + "step": 7824, + "task_loss": 1.0623761415481567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.28127920627593994, + "epoch": 6.61, + "learning_rate": 1.8808114961961117e-05, + "loss": 0.4432, + "step": 7825, + "task_loss": 0.21630476415157318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6065192222595215, + "epoch": 6.61, + "learning_rate": 1.8803418803418804e-05, + "loss": 0.6879, + "step": 7826, + "task_loss": 1.3084105253219604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9372395277023315, + "epoch": 6.62, + "learning_rate": 1.8798722644876493e-05, + "loss": 0.8074, + "step": 7827, + "task_loss": 0.8304434418678284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8588280081748962, + "epoch": 6.62, + "learning_rate": 1.879402648633418e-05, + "loss": 0.9756, + "step": 7828, + "task_loss": 1.5051504373550415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8606330156326294, + "epoch": 6.62, + "learning_rate": 1.878933032779187e-05, + "loss": 0.8129, + "step": 7829, + "task_loss": 1.0700019598007202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6594505310058594, + "epoch": 6.62, + "learning_rate": 1.8784634169249556e-05, + "loss": 0.7928, + "step": 7830, + "task_loss": 0.47969910502433777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7884535789489746, + "epoch": 6.62, + "learning_rate": 1.8779938010707242e-05, + "loss": 0.6013, + "step": 7831, + "task_loss": 0.9770629405975342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2070974111557007, + "epoch": 6.62, + "learning_rate": 1.877524185216493e-05, + "loss": 0.7962, + "step": 7832, + "task_loss": 0.6993933320045471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40967270731925964, + "epoch": 6.62, + "learning_rate": 1.8770545693622618e-05, + "loss": 0.6595, + "step": 7833, + "task_loss": 0.7899876236915588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7396615147590637, + "epoch": 6.62, + "learning_rate": 1.8765849535080304e-05, + "loss": 0.954, + "step": 7834, + "task_loss": 0.5316251516342163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.68337082862854, + "epoch": 6.62, + "learning_rate": 1.8761153376537994e-05, + "loss": 0.7433, + "step": 7835, + "task_loss": 1.0057151317596436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43260130286216736, + "epoch": 6.62, + "learning_rate": 1.875645721799568e-05, + "loss": 0.6786, + "step": 7836, + "task_loss": 0.7278441190719604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9949042797088623, + "epoch": 6.62, + "learning_rate": 1.8751761059453367e-05, + "loss": 0.7617, + "step": 7837, + "task_loss": 0.6457927823066711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6649297475814819, + "epoch": 6.63, + "learning_rate": 1.8747064900911053e-05, + "loss": 0.6905, + "step": 7838, + "task_loss": 0.4780188500881195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7206320762634277, + "epoch": 6.63, + "learning_rate": 1.8742368742368743e-05, + "loss": 0.8024, + "step": 7839, + "task_loss": 1.5674070119857788 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.151001214981079, + "epoch": 6.63, + "learning_rate": 1.8737672583826433e-05, + "loss": 0.7241, + "step": 7840, + "task_loss": 1.2793731689453125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.22347110509872437, + "epoch": 6.63, + "learning_rate": 1.873297642528412e-05, + "loss": 0.5819, + "step": 7841, + "task_loss": 0.0804326981306076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0909905433654785, + "epoch": 6.63, + "learning_rate": 1.872828026674181e-05, + "loss": 0.8094, + "step": 7842, + "task_loss": 1.0655397176742554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.725278377532959, + "epoch": 6.63, + "learning_rate": 1.8723584108199495e-05, + "loss": 0.7924, + "step": 7843, + "task_loss": 0.9503967761993408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.981561005115509, + "epoch": 6.63, + "learning_rate": 1.871888794965718e-05, + "loss": 0.7637, + "step": 7844, + "task_loss": 1.4444925785064697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6943938732147217, + "epoch": 6.63, + "learning_rate": 1.8714191791114868e-05, + "loss": 0.8357, + "step": 7845, + "task_loss": 1.0910625457763672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.886143684387207, + "epoch": 6.63, + "learning_rate": 1.8709495632572557e-05, + "loss": 0.9499, + "step": 7846, + "task_loss": 2.3945388793945312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9058287143707275, + "epoch": 6.63, + "learning_rate": 1.8704799474030244e-05, + "loss": 0.7592, + "step": 7847, + "task_loss": 0.6217558979988098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6692575216293335, + "epoch": 6.63, + "learning_rate": 1.8700103315487934e-05, + "loss": 0.6718, + "step": 7848, + "task_loss": 0.6496251225471497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.695428729057312, + "epoch": 6.63, + "learning_rate": 1.869540715694562e-05, + "loss": 0.7254, + "step": 7849, + "task_loss": 0.7545936703681946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5767747759819031, + "epoch": 6.64, + "learning_rate": 1.8690710998403306e-05, + "loss": 0.549, + "step": 7850, + "task_loss": 1.1683772802352905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.473606824874878, + "epoch": 6.64, + "learning_rate": 1.8686014839860993e-05, + "loss": 0.8785, + "step": 7851, + "task_loss": 0.6885291934013367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7994762659072876, + "epoch": 6.64, + "learning_rate": 1.8681318681318682e-05, + "loss": 0.7681, + "step": 7852, + "task_loss": 0.9728848338127136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3819310665130615, + "epoch": 6.64, + "learning_rate": 1.867662252277637e-05, + "loss": 0.8053, + "step": 7853, + "task_loss": 1.0391877889633179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49070021510124207, + "epoch": 6.64, + "learning_rate": 1.867192636423406e-05, + "loss": 0.7229, + "step": 7854, + "task_loss": 0.10621807724237442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.667739748954773, + "epoch": 6.64, + "learning_rate": 1.8667230205691748e-05, + "loss": 0.5962, + "step": 7855, + "task_loss": 0.7288939952850342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7713451385498047, + "epoch": 6.64, + "learning_rate": 1.866253404714943e-05, + "loss": 0.8561, + "step": 7856, + "task_loss": 1.4917329549789429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9241687059402466, + "epoch": 6.64, + "learning_rate": 1.865783788860712e-05, + "loss": 0.6508, + "step": 7857, + "task_loss": 0.5608865022659302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6861218810081482, + "epoch": 6.64, + "learning_rate": 1.8653141730064807e-05, + "loss": 0.6652, + "step": 7858, + "task_loss": 0.2603943347930908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1871118545532227, + "epoch": 6.64, + "learning_rate": 1.8648445571522497e-05, + "loss": 0.8852, + "step": 7859, + "task_loss": 1.111939787864685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.410509318113327, + "epoch": 6.64, + "learning_rate": 1.8643749412980183e-05, + "loss": 0.5672, + "step": 7860, + "task_loss": 0.538434624671936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3582940995693207, + "epoch": 6.64, + "learning_rate": 1.8639053254437873e-05, + "loss": 0.6643, + "step": 7861, + "task_loss": 0.03490319475531578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9138627052307129, + "epoch": 6.65, + "learning_rate": 1.863435709589556e-05, + "loss": 0.758, + "step": 7862, + "task_loss": 0.8516718745231628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4916948676109314, + "epoch": 6.65, + "learning_rate": 1.8629660937353246e-05, + "loss": 0.6038, + "step": 7863, + "task_loss": 0.909015953540802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7165804505348206, + "epoch": 6.65, + "learning_rate": 1.8624964778810932e-05, + "loss": 0.5611, + "step": 7864, + "task_loss": 0.69620680809021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0723464488983154, + "epoch": 6.65, + "learning_rate": 1.862026862026862e-05, + "loss": 0.7505, + "step": 7865, + "task_loss": 1.4382755756378174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3903179168701172, + "epoch": 6.65, + "learning_rate": 1.8615572461726308e-05, + "loss": 0.6346, + "step": 7866, + "task_loss": 0.2761421203613281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7648067474365234, + "epoch": 6.65, + "learning_rate": 1.8610876303183998e-05, + "loss": 0.6708, + "step": 7867, + "task_loss": 1.2758221626281738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6692013740539551, + "epoch": 6.65, + "learning_rate": 1.8606180144641684e-05, + "loss": 0.7081, + "step": 7868, + "task_loss": 0.6584583520889282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3371739685535431, + "epoch": 6.65, + "learning_rate": 1.860148398609937e-05, + "loss": 0.6335, + "step": 7869, + "task_loss": 0.6395004987716675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4584331512451172, + "epoch": 6.65, + "learning_rate": 1.8596787827557057e-05, + "loss": 0.5971, + "step": 7870, + "task_loss": 1.3012323379516602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9976335167884827, + "epoch": 6.65, + "learning_rate": 1.8592091669014746e-05, + "loss": 0.8223, + "step": 7871, + "task_loss": 1.2253007888793945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4941475987434387, + "epoch": 6.65, + "learning_rate": 1.8587395510472436e-05, + "loss": 0.7129, + "step": 7872, + "task_loss": 1.3460168838500977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8650755286216736, + "epoch": 6.65, + "learning_rate": 1.8582699351930122e-05, + "loss": 0.6712, + "step": 7873, + "task_loss": 1.1510897874832153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9519252777099609, + "epoch": 6.66, + "learning_rate": 1.8578003193387812e-05, + "loss": 0.7787, + "step": 7874, + "task_loss": 0.6160178184509277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9744563102722168, + "epoch": 6.66, + "learning_rate": 1.8573307034845495e-05, + "loss": 0.8978, + "step": 7875, + "task_loss": 0.7912254929542542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5862891674041748, + "epoch": 6.66, + "learning_rate": 1.8568610876303185e-05, + "loss": 0.6286, + "step": 7876, + "task_loss": 0.17454218864440918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.847195565700531, + "epoch": 6.66, + "learning_rate": 1.856391471776087e-05, + "loss": 0.8001, + "step": 7877, + "task_loss": 1.1676759719848633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.19663405418396, + "epoch": 6.66, + "learning_rate": 1.855921855921856e-05, + "loss": 0.8, + "step": 7878, + "task_loss": 1.4962078332901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9336768388748169, + "epoch": 6.66, + "learning_rate": 1.8554522400676247e-05, + "loss": 0.6937, + "step": 7879, + "task_loss": 1.0022307634353638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7022480964660645, + "epoch": 6.66, + "learning_rate": 1.8549826242133937e-05, + "loss": 0.6243, + "step": 7880, + "task_loss": 0.301519513130188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7668818831443787, + "epoch": 6.66, + "learning_rate": 1.8545130083591623e-05, + "loss": 0.7655, + "step": 7881, + "task_loss": 0.9754584431648254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6367032527923584, + "epoch": 6.66, + "learning_rate": 1.854043392504931e-05, + "loss": 0.6221, + "step": 7882, + "task_loss": 0.3960645794868469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5827299356460571, + "epoch": 6.66, + "learning_rate": 1.8535737766506996e-05, + "loss": 0.6088, + "step": 7883, + "task_loss": 0.381015419960022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1089345216751099, + "epoch": 6.66, + "learning_rate": 1.8531041607964686e-05, + "loss": 0.8348, + "step": 7884, + "task_loss": 0.7361460328102112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6794453859329224, + "epoch": 6.66, + "learning_rate": 1.8526345449422372e-05, + "loss": 0.9032, + "step": 7885, + "task_loss": 1.330956220626831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7649685144424438, + "epoch": 6.67, + "learning_rate": 1.8521649290880062e-05, + "loss": 0.6831, + "step": 7886, + "task_loss": 1.0181002616882324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4819204807281494, + "epoch": 6.67, + "learning_rate": 1.8516953132337748e-05, + "loss": 0.766, + "step": 7887, + "task_loss": 1.3499748706817627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5072662830352783, + "epoch": 6.67, + "learning_rate": 1.8512256973795435e-05, + "loss": 0.676, + "step": 7888, + "task_loss": 0.8055269122123718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9185647368431091, + "epoch": 6.67, + "learning_rate": 1.8507560815253124e-05, + "loss": 0.8706, + "step": 7889, + "task_loss": 1.2536611557006836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46412649750709534, + "epoch": 6.67, + "learning_rate": 1.850286465671081e-05, + "loss": 0.5188, + "step": 7890, + "task_loss": 1.0379188060760498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36142587661743164, + "epoch": 6.67, + "learning_rate": 1.84981684981685e-05, + "loss": 0.4808, + "step": 7891, + "task_loss": 0.23987053334712982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8084214329719543, + "epoch": 6.67, + "learning_rate": 1.8493472339626187e-05, + "loss": 0.6773, + "step": 7892, + "task_loss": 0.5548166632652283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4335891604423523, + "epoch": 6.67, + "learning_rate": 1.8488776181083876e-05, + "loss": 0.6865, + "step": 7893, + "task_loss": 1.1114609241485596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5552604794502258, + "epoch": 6.67, + "learning_rate": 1.8484080022541563e-05, + "loss": 0.7126, + "step": 7894, + "task_loss": 1.2031651735305786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6247009038925171, + "epoch": 6.67, + "learning_rate": 1.847938386399925e-05, + "loss": 0.5654, + "step": 7895, + "task_loss": 0.7995830178260803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33611488342285156, + "epoch": 6.67, + "learning_rate": 1.8474687705456935e-05, + "loss": 0.6386, + "step": 7896, + "task_loss": 0.6066705584526062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.009952187538147, + "epoch": 6.67, + "learning_rate": 1.8469991546914625e-05, + "loss": 0.9248, + "step": 7897, + "task_loss": 1.1409581899642944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8554931282997131, + "epoch": 6.68, + "learning_rate": 1.846529538837231e-05, + "loss": 0.7365, + "step": 7898, + "task_loss": 0.7813534736633301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6962049007415771, + "epoch": 6.68, + "learning_rate": 1.846059922983e-05, + "loss": 0.7716, + "step": 7899, + "task_loss": 1.0813212394714355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3811628818511963, + "epoch": 6.68, + "learning_rate": 1.8455903071287688e-05, + "loss": 0.5666, + "step": 7900, + "task_loss": 0.3818693459033966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8189781904220581, + "epoch": 6.68, + "learning_rate": 1.8451206912745374e-05, + "loss": 0.6425, + "step": 7901, + "task_loss": 0.6894546151161194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8266512751579285, + "epoch": 6.68, + "learning_rate": 1.8446510754203064e-05, + "loss": 0.6748, + "step": 7902, + "task_loss": 0.8933721780776978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3638013005256653, + "epoch": 6.68, + "learning_rate": 1.844181459566075e-05, + "loss": 0.5379, + "step": 7903, + "task_loss": 0.7172104120254517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7087752819061279, + "epoch": 6.68, + "learning_rate": 1.843711843711844e-05, + "loss": 0.9135, + "step": 7904, + "task_loss": 1.932684063911438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48375391960144043, + "epoch": 6.68, + "learning_rate": 1.8432422278576126e-05, + "loss": 0.7564, + "step": 7905, + "task_loss": 1.0751687288284302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5894186496734619, + "epoch": 6.68, + "learning_rate": 1.8427726120033816e-05, + "loss": 0.8414, + "step": 7906, + "task_loss": 0.8775391578674316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8144789934158325, + "epoch": 6.68, + "learning_rate": 1.84230299614915e-05, + "loss": 0.5902, + "step": 7907, + "task_loss": 0.7461346983909607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0533013343811035, + "epoch": 6.68, + "learning_rate": 1.841833380294919e-05, + "loss": 0.6641, + "step": 7908, + "task_loss": 0.40353822708129883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6165851354598999, + "epoch": 6.69, + "learning_rate": 1.8413637644406875e-05, + "loss": 0.7678, + "step": 7909, + "task_loss": 0.6708202958106995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6888418793678284, + "epoch": 6.69, + "learning_rate": 1.8408941485864564e-05, + "loss": 0.6151, + "step": 7910, + "task_loss": 0.9505549669265747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4211847484111786, + "epoch": 6.69, + "learning_rate": 1.840424532732225e-05, + "loss": 0.5525, + "step": 7911, + "task_loss": 0.32170945405960083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6918339133262634, + "epoch": 6.69, + "learning_rate": 1.839954916877994e-05, + "loss": 0.6341, + "step": 7912, + "task_loss": 0.8993133902549744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8506077527999878, + "epoch": 6.69, + "learning_rate": 1.8394853010237627e-05, + "loss": 0.6666, + "step": 7913, + "task_loss": 0.5703252553939819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6750425100326538, + "epoch": 6.69, + "learning_rate": 1.8390156851695313e-05, + "loss": 0.8671, + "step": 7914, + "task_loss": 0.9089783430099487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.574044942855835, + "epoch": 6.69, + "learning_rate": 1.8385460693153e-05, + "loss": 0.5112, + "step": 7915, + "task_loss": 1.2618130445480347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5041455030441284, + "epoch": 6.69, + "learning_rate": 1.838076453461069e-05, + "loss": 0.6554, + "step": 7916, + "task_loss": 0.8494638204574585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7480868697166443, + "epoch": 6.69, + "learning_rate": 1.837606837606838e-05, + "loss": 0.848, + "step": 7917, + "task_loss": 0.3841010630130768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37854355573654175, + "epoch": 6.69, + "learning_rate": 1.8371372217526065e-05, + "loss": 0.7072, + "step": 7918, + "task_loss": 0.0911397933959961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8841831684112549, + "epoch": 6.69, + "learning_rate": 1.8366676058983752e-05, + "loss": 0.689, + "step": 7919, + "task_loss": 0.7403742671012878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7463845014572144, + "epoch": 6.69, + "learning_rate": 1.8361979900441438e-05, + "loss": 0.7868, + "step": 7920, + "task_loss": 0.7863142490386963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1865637302398682, + "epoch": 6.7, + "learning_rate": 1.8357283741899128e-05, + "loss": 0.8119, + "step": 7921, + "task_loss": 1.0542932748794556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8660508990287781, + "epoch": 6.7, + "learning_rate": 1.8352587583356814e-05, + "loss": 0.7222, + "step": 7922, + "task_loss": 0.2783169150352478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7154989838600159, + "epoch": 6.7, + "learning_rate": 1.8347891424814504e-05, + "loss": 0.6366, + "step": 7923, + "task_loss": 0.7387536764144897 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8136405944824219, + "epoch": 6.7, + "learning_rate": 1.834319526627219e-05, + "loss": 0.7286, + "step": 7924, + "task_loss": 0.4507836699485779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6038713455200195, + "epoch": 6.7, + "learning_rate": 1.833849910772988e-05, + "loss": 0.5986, + "step": 7925, + "task_loss": 0.5910147428512573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.534325122833252, + "epoch": 6.7, + "learning_rate": 1.8333802949187566e-05, + "loss": 0.6766, + "step": 7926, + "task_loss": 0.6559394598007202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7273287773132324, + "epoch": 6.7, + "learning_rate": 1.8329106790645253e-05, + "loss": 0.5199, + "step": 7927, + "task_loss": 0.5171031951904297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5430275797843933, + "epoch": 6.7, + "learning_rate": 1.832441063210294e-05, + "loss": 0.6632, + "step": 7928, + "task_loss": 0.7681781649589539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9331591129302979, + "epoch": 6.7, + "learning_rate": 1.831971447356063e-05, + "loss": 0.7638, + "step": 7929, + "task_loss": 0.8586137294769287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.574021577835083, + "epoch": 6.7, + "learning_rate": 1.8315018315018315e-05, + "loss": 1.0647, + "step": 7930, + "task_loss": 0.9498838186264038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48796409368515015, + "epoch": 6.7, + "learning_rate": 1.8310322156476005e-05, + "loss": 0.6966, + "step": 7931, + "task_loss": 0.5110630393028259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6608863472938538, + "epoch": 6.7, + "learning_rate": 1.830562599793369e-05, + "loss": 0.8187, + "step": 7932, + "task_loss": 1.179255723953247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5754683017730713, + "epoch": 6.71, + "learning_rate": 1.8300929839391377e-05, + "loss": 0.6756, + "step": 7933, + "task_loss": 0.7088960409164429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.634117603302002, + "epoch": 6.71, + "learning_rate": 1.8296233680849067e-05, + "loss": 0.7904, + "step": 7934, + "task_loss": 0.4011755883693695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6617614030838013, + "epoch": 6.71, + "learning_rate": 1.8291537522306753e-05, + "loss": 0.7491, + "step": 7935, + "task_loss": 1.725034475326538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7297960519790649, + "epoch": 6.71, + "learning_rate": 1.8286841363764443e-05, + "loss": 0.6529, + "step": 7936, + "task_loss": 1.0573828220367432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9059305787086487, + "epoch": 6.71, + "learning_rate": 1.828214520522213e-05, + "loss": 0.8028, + "step": 7937, + "task_loss": 1.1649330854415894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.659383237361908, + "epoch": 6.71, + "learning_rate": 1.827744904667982e-05, + "loss": 0.544, + "step": 7938, + "task_loss": 0.113300621509552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0751678943634033, + "epoch": 6.71, + "learning_rate": 1.8272752888137502e-05, + "loss": 0.5793, + "step": 7939, + "task_loss": 1.0219407081604004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1095887422561646, + "epoch": 6.71, + "learning_rate": 1.8268056729595192e-05, + "loss": 0.8892, + "step": 7940, + "task_loss": 1.1047041416168213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7027661204338074, + "epoch": 6.71, + "learning_rate": 1.8263360571052878e-05, + "loss": 0.8473, + "step": 7941, + "task_loss": 1.01369047164917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3772653043270111, + "epoch": 6.71, + "learning_rate": 1.8258664412510568e-05, + "loss": 0.405, + "step": 7942, + "task_loss": 0.8380016088485718 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6938070058822632, + "epoch": 6.71, + "learning_rate": 1.8253968253968254e-05, + "loss": 0.5531, + "step": 7943, + "task_loss": 0.5890017151832581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39807236194610596, + "epoch": 6.71, + "learning_rate": 1.8249272095425944e-05, + "loss": 0.8595, + "step": 7944, + "task_loss": 0.29542940855026245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6868453025817871, + "epoch": 6.72, + "learning_rate": 1.824457593688363e-05, + "loss": 0.8595, + "step": 7945, + "task_loss": 0.6749566197395325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4321601688861847, + "epoch": 6.72, + "learning_rate": 1.8239879778341317e-05, + "loss": 0.6605, + "step": 7946, + "task_loss": 0.5932649374008179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47573405504226685, + "epoch": 6.72, + "learning_rate": 1.8235183619799003e-05, + "loss": 0.6114, + "step": 7947, + "task_loss": 0.36540669202804565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.179674744606018, + "epoch": 6.72, + "learning_rate": 1.8230487461256693e-05, + "loss": 0.6964, + "step": 7948, + "task_loss": 1.3381692171096802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27693408727645874, + "epoch": 6.72, + "learning_rate": 1.8225791302714383e-05, + "loss": 0.4121, + "step": 7949, + "task_loss": 0.34034907817840576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45274123549461365, + "epoch": 6.72, + "learning_rate": 1.822109514417207e-05, + "loss": 0.6604, + "step": 7950, + "task_loss": 0.7094851136207581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.86525559425354, + "epoch": 6.72, + "learning_rate": 1.8216398985629755e-05, + "loss": 0.8097, + "step": 7951, + "task_loss": 1.4980319738388062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7549800276756287, + "epoch": 6.72, + "learning_rate": 1.821170282708744e-05, + "loss": 0.598, + "step": 7952, + "task_loss": 0.9671727418899536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7765780687332153, + "epoch": 6.72, + "learning_rate": 1.820700666854513e-05, + "loss": 0.7074, + "step": 7953, + "task_loss": 0.9649078845977783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.637029230594635, + "epoch": 6.72, + "learning_rate": 1.8202310510002818e-05, + "loss": 0.6844, + "step": 7954, + "task_loss": 0.25865480303764343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8664615154266357, + "epoch": 6.72, + "learning_rate": 1.8197614351460507e-05, + "loss": 0.6513, + "step": 7955, + "task_loss": 0.8578440546989441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41346997022628784, + "epoch": 6.72, + "learning_rate": 1.8192918192918194e-05, + "loss": 0.5843, + "step": 7956, + "task_loss": 0.6374626755714417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27111926674842834, + "epoch": 6.73, + "learning_rate": 1.8188222034375883e-05, + "loss": 0.8302, + "step": 7957, + "task_loss": 0.6497591733932495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6577046513557434, + "epoch": 6.73, + "learning_rate": 1.8183525875833566e-05, + "loss": 0.6675, + "step": 7958, + "task_loss": 0.8724526762962341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6831278800964355, + "epoch": 6.73, + "learning_rate": 1.8178829717291256e-05, + "loss": 0.592, + "step": 7959, + "task_loss": 1.1061406135559082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4870046377182007, + "epoch": 6.73, + "learning_rate": 1.8174133558748942e-05, + "loss": 0.482, + "step": 7960, + "task_loss": 0.828632652759552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.694338321685791, + "epoch": 6.73, + "learning_rate": 1.8169437400206632e-05, + "loss": 0.7124, + "step": 7961, + "task_loss": 0.6959808468818665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7595100998878479, + "epoch": 6.73, + "learning_rate": 1.816474124166432e-05, + "loss": 0.7484, + "step": 7962, + "task_loss": 0.2699923515319824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6249562501907349, + "epoch": 6.73, + "learning_rate": 1.8160045083122008e-05, + "loss": 0.4847, + "step": 7963, + "task_loss": 0.25175535678863525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0113379955291748, + "epoch": 6.73, + "learning_rate": 1.8155348924579695e-05, + "loss": 0.664, + "step": 7964, + "task_loss": 0.7499567270278931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6708813905715942, + "epoch": 6.73, + "learning_rate": 1.815065276603738e-05, + "loss": 0.6245, + "step": 7965, + "task_loss": 0.8841532468795776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45417991280555725, + "epoch": 6.73, + "learning_rate": 1.814595660749507e-05, + "loss": 0.794, + "step": 7966, + "task_loss": 0.6017684936523438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9415045380592346, + "epoch": 6.73, + "learning_rate": 1.8141260448952757e-05, + "loss": 0.6708, + "step": 7967, + "task_loss": 0.810107409954071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5682117938995361, + "epoch": 6.73, + "learning_rate": 1.8136564290410447e-05, + "loss": 0.7, + "step": 7968, + "task_loss": 0.8896453976631165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7057149410247803, + "epoch": 6.74, + "learning_rate": 1.8131868131868133e-05, + "loss": 0.5911, + "step": 7969, + "task_loss": 1.0541017055511475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3720572590827942, + "epoch": 6.74, + "learning_rate": 1.812717197332582e-05, + "loss": 0.5753, + "step": 7970, + "task_loss": 0.5229494571685791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26686209440231323, + "epoch": 6.74, + "learning_rate": 1.8122475814783506e-05, + "loss": 0.5754, + "step": 7971, + "task_loss": 0.39854317903518677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7378036975860596, + "epoch": 6.74, + "learning_rate": 1.8117779656241195e-05, + "loss": 0.7868, + "step": 7972, + "task_loss": 0.35473713278770447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7486150860786438, + "epoch": 6.74, + "learning_rate": 1.8113083497698882e-05, + "loss": 0.6998, + "step": 7973, + "task_loss": 1.037705421447754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7320632934570312, + "epoch": 6.74, + "learning_rate": 1.810838733915657e-05, + "loss": 0.789, + "step": 7974, + "task_loss": 0.46460893750190735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6991386413574219, + "epoch": 6.74, + "learning_rate": 1.8103691180614258e-05, + "loss": 0.7022, + "step": 7975, + "task_loss": 0.2743189036846161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7205047607421875, + "epoch": 6.74, + "learning_rate": 1.8098995022071948e-05, + "loss": 0.6746, + "step": 7976, + "task_loss": 0.9362558722496033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6944296360015869, + "epoch": 6.74, + "learning_rate": 1.8094298863529634e-05, + "loss": 0.6601, + "step": 7977, + "task_loss": 0.454405814409256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5766761302947998, + "epoch": 6.74, + "learning_rate": 1.808960270498732e-05, + "loss": 0.7658, + "step": 7978, + "task_loss": 0.8851684927940369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.096362829208374, + "epoch": 6.74, + "learning_rate": 1.808490654644501e-05, + "loss": 0.8801, + "step": 7979, + "task_loss": 0.5858886241912842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8550338745117188, + "epoch": 6.75, + "learning_rate": 1.8080210387902696e-05, + "loss": 0.743, + "step": 7980, + "task_loss": 0.9079235196113586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2198041677474976, + "epoch": 6.75, + "learning_rate": 1.8075514229360386e-05, + "loss": 0.8365, + "step": 7981, + "task_loss": 1.197420358657837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6381044387817383, + "epoch": 6.75, + "learning_rate": 1.8070818070818072e-05, + "loss": 0.702, + "step": 7982, + "task_loss": 0.3985850512981415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6946704387664795, + "epoch": 6.75, + "learning_rate": 1.806612191227576e-05, + "loss": 0.9497, + "step": 7983, + "task_loss": 0.697201669216156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.23690593242645264, + "epoch": 6.75, + "learning_rate": 1.8061425753733445e-05, + "loss": 0.707, + "step": 7984, + "task_loss": 0.27074435353279114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5232446193695068, + "epoch": 6.75, + "learning_rate": 1.8056729595191135e-05, + "loss": 0.6158, + "step": 7985, + "task_loss": 0.5616576075553894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.515991747379303, + "epoch": 6.75, + "learning_rate": 1.805203343664882e-05, + "loss": 0.803, + "step": 7986, + "task_loss": 1.1551685333251953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8239456415176392, + "epoch": 6.75, + "learning_rate": 1.804733727810651e-05, + "loss": 0.8247, + "step": 7987, + "task_loss": 0.693233847618103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.751591682434082, + "epoch": 6.75, + "learning_rate": 1.8042641119564197e-05, + "loss": 0.6679, + "step": 7988, + "task_loss": 0.9313406348228455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.038351058959961, + "epoch": 6.75, + "learning_rate": 1.8037944961021887e-05, + "loss": 0.7906, + "step": 7989, + "task_loss": 0.8524459600448608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1990362405776978, + "epoch": 6.75, + "learning_rate": 1.803324880247957e-05, + "loss": 0.9134, + "step": 7990, + "task_loss": 1.4911595582962036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8936008214950562, + "epoch": 6.75, + "learning_rate": 1.802855264393726e-05, + "loss": 0.8676, + "step": 7991, + "task_loss": 0.42948809266090393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8148795366287231, + "epoch": 6.76, + "learning_rate": 1.8023856485394946e-05, + "loss": 0.7236, + "step": 7992, + "task_loss": 0.7880675792694092 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7130094766616821, + "epoch": 6.76, + "learning_rate": 1.8019160326852636e-05, + "loss": 0.6065, + "step": 7993, + "task_loss": 0.8865887522697449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4009692668914795, + "epoch": 6.76, + "learning_rate": 1.8014464168310325e-05, + "loss": 0.5261, + "step": 7994, + "task_loss": 0.19225730001926422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7292789816856384, + "epoch": 6.76, + "learning_rate": 1.8009768009768012e-05, + "loss": 0.7411, + "step": 7995, + "task_loss": 1.9855096340179443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.632171094417572, + "epoch": 6.76, + "learning_rate": 1.8005071851225698e-05, + "loss": 0.6413, + "step": 7996, + "task_loss": 0.725261926651001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7210355997085571, + "epoch": 6.76, + "learning_rate": 1.8000375692683384e-05, + "loss": 0.6499, + "step": 7997, + "task_loss": 0.7822661995887756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.75420081615448, + "epoch": 6.76, + "learning_rate": 1.7995679534141074e-05, + "loss": 0.6456, + "step": 7998, + "task_loss": 1.437920331954956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9440190196037292, + "epoch": 6.76, + "learning_rate": 1.799098337559876e-05, + "loss": 0.7851, + "step": 7999, + "task_loss": 0.5991203188896179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7499675750732422, + "epoch": 6.76, + "learning_rate": 1.798628721705645e-05, + "loss": 0.7319, + "step": 8000, + "task_loss": 0.5333141684532166 + }, + { + "epoch": 6.76, + "eval_accuracy": 0.8933069306930693, + "eval_loss": 0.4493527114391327, + "eval_runtime": 225.2937, + "eval_samples_per_second": 112.076, + "eval_steps_per_second": 0.879, + "step": 8000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5895742177963257, + "epoch": 6.76, + "learning_rate": 1.7981591058514137e-05, + "loss": 0.8241, + "step": 8001, + "task_loss": 1.0977030992507935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0360515117645264, + "epoch": 6.76, + "learning_rate": 1.7976894899971823e-05, + "loss": 0.6784, + "step": 8002, + "task_loss": 0.7299138903617859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.370789647102356, + "epoch": 6.76, + "learning_rate": 1.797219874142951e-05, + "loss": 0.7494, + "step": 8003, + "task_loss": 1.210050344467163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4126530587673187, + "epoch": 6.77, + "learning_rate": 1.79675025828872e-05, + "loss": 0.7112, + "step": 8004, + "task_loss": 0.6391266584396362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8201828002929688, + "epoch": 6.77, + "learning_rate": 1.7962806424344885e-05, + "loss": 1.1058, + "step": 8005, + "task_loss": 1.5582082271575928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5877934694290161, + "epoch": 6.77, + "learning_rate": 1.7958110265802575e-05, + "loss": 0.8139, + "step": 8006, + "task_loss": 1.2045164108276367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5264323949813843, + "epoch": 6.77, + "learning_rate": 1.795341410726026e-05, + "loss": 0.6626, + "step": 8007, + "task_loss": 1.127972960472107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1265079975128174, + "epoch": 6.77, + "learning_rate": 1.794871794871795e-05, + "loss": 0.71, + "step": 8008, + "task_loss": 0.4674353301525116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6414729356765747, + "epoch": 6.77, + "learning_rate": 1.7944021790175637e-05, + "loss": 0.7546, + "step": 8009, + "task_loss": 0.7259740829467773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4719139039516449, + "epoch": 6.77, + "learning_rate": 1.7939325631633324e-05, + "loss": 0.5668, + "step": 8010, + "task_loss": 0.326391339302063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5669357776641846, + "epoch": 6.77, + "learning_rate": 1.7934629473091014e-05, + "loss": 0.7682, + "step": 8011, + "task_loss": 0.5344647169113159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4654301106929779, + "epoch": 6.77, + "learning_rate": 1.79299333145487e-05, + "loss": 0.6913, + "step": 8012, + "task_loss": 0.8966190218925476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5026566982269287, + "epoch": 6.77, + "learning_rate": 1.792523715600639e-05, + "loss": 0.7439, + "step": 8013, + "task_loss": 0.5133823752403259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45161157846450806, + "epoch": 6.77, + "learning_rate": 1.7920540997464076e-05, + "loss": 0.4881, + "step": 8014, + "task_loss": 1.409341812133789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6472766399383545, + "epoch": 6.77, + "learning_rate": 1.7915844838921762e-05, + "loss": 0.6912, + "step": 8015, + "task_loss": 0.7846285104751587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8106926679611206, + "epoch": 6.78, + "learning_rate": 1.791114868037945e-05, + "loss": 0.7025, + "step": 8016, + "task_loss": 1.1543165445327759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6456445455551147, + "epoch": 6.78, + "learning_rate": 1.790645252183714e-05, + "loss": 0.8015, + "step": 8017, + "task_loss": 0.7076823115348816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.030876874923706, + "epoch": 6.78, + "learning_rate": 1.7901756363294825e-05, + "loss": 0.7952, + "step": 8018, + "task_loss": 2.3905186653137207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3849503695964813, + "epoch": 6.78, + "learning_rate": 1.7897060204752514e-05, + "loss": 0.7707, + "step": 8019, + "task_loss": 0.22061610221862793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8599023818969727, + "epoch": 6.78, + "learning_rate": 1.78923640462102e-05, + "loss": 0.6732, + "step": 8020, + "task_loss": 0.8688079118728638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8625783920288086, + "epoch": 6.78, + "learning_rate": 1.788766788766789e-05, + "loss": 0.8248, + "step": 8021, + "task_loss": 0.9139476418495178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9382263422012329, + "epoch": 6.78, + "learning_rate": 1.7882971729125573e-05, + "loss": 0.8145, + "step": 8022, + "task_loss": 1.362126350402832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34017205238342285, + "epoch": 6.78, + "learning_rate": 1.7878275570583263e-05, + "loss": 0.6446, + "step": 8023, + "task_loss": 0.1698073148727417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6742199659347534, + "epoch": 6.78, + "learning_rate": 1.787357941204095e-05, + "loss": 0.6398, + "step": 8024, + "task_loss": 0.8295663595199585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3501206636428833, + "epoch": 6.78, + "learning_rate": 1.786888325349864e-05, + "loss": 0.5796, + "step": 8025, + "task_loss": 0.5304601192474365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7201451063156128, + "epoch": 6.78, + "learning_rate": 1.786418709495633e-05, + "loss": 0.6694, + "step": 8026, + "task_loss": 0.7510545253753662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5976323485374451, + "epoch": 6.78, + "learning_rate": 1.7859490936414015e-05, + "loss": 0.6548, + "step": 8027, + "task_loss": 1.1790045499801636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4399861693382263, + "epoch": 6.79, + "learning_rate": 1.78547947778717e-05, + "loss": 0.6176, + "step": 8028, + "task_loss": 0.5377351641654968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8051717281341553, + "epoch": 6.79, + "learning_rate": 1.7850098619329388e-05, + "loss": 0.7058, + "step": 8029, + "task_loss": 1.3748936653137207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46252351999282837, + "epoch": 6.79, + "learning_rate": 1.7845402460787078e-05, + "loss": 0.7076, + "step": 8030, + "task_loss": 0.8910819888114929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2647662162780762, + "epoch": 6.79, + "learning_rate": 1.7840706302244764e-05, + "loss": 0.8755, + "step": 8031, + "task_loss": 1.759522557258606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5914826989173889, + "epoch": 6.79, + "learning_rate": 1.7836010143702454e-05, + "loss": 0.7055, + "step": 8032, + "task_loss": 0.5943803787231445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7254488468170166, + "epoch": 6.79, + "learning_rate": 1.783131398516014e-05, + "loss": 0.6026, + "step": 8033, + "task_loss": 0.7513352632522583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7732395529747009, + "epoch": 6.79, + "learning_rate": 1.7826617826617826e-05, + "loss": 0.8287, + "step": 8034, + "task_loss": 0.6602270603179932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5491492748260498, + "epoch": 6.79, + "learning_rate": 1.7821921668075513e-05, + "loss": 0.574, + "step": 8035, + "task_loss": 0.6980670690536499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6102688312530518, + "epoch": 6.79, + "learning_rate": 1.7817225509533203e-05, + "loss": 0.7835, + "step": 8036, + "task_loss": 0.3817468583583832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5911701917648315, + "epoch": 6.79, + "learning_rate": 1.781252935099089e-05, + "loss": 0.6996, + "step": 8037, + "task_loss": 0.412977933883667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8916078805923462, + "epoch": 6.79, + "learning_rate": 1.780783319244858e-05, + "loss": 0.8117, + "step": 8038, + "task_loss": 1.0715104341506958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5727194547653198, + "epoch": 6.79, + "learning_rate": 1.7803137033906265e-05, + "loss": 0.5881, + "step": 8039, + "task_loss": 0.7758819460868835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4205164909362793, + "epoch": 6.8, + "learning_rate": 1.7798440875363955e-05, + "loss": 0.4401, + "step": 8040, + "task_loss": 0.3613310754299164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.768086314201355, + "epoch": 6.8, + "learning_rate": 1.779374471682164e-05, + "loss": 0.8839, + "step": 8041, + "task_loss": 0.602063000202179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5803719758987427, + "epoch": 6.8, + "learning_rate": 1.7789048558279327e-05, + "loss": 0.7653, + "step": 8042, + "task_loss": 0.3529641032218933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.668161153793335, + "epoch": 6.8, + "learning_rate": 1.7784352399737017e-05, + "loss": 0.7048, + "step": 8043, + "task_loss": 0.22776482999324799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.19819016754627228, + "epoch": 6.8, + "learning_rate": 1.7779656241194703e-05, + "loss": 0.6453, + "step": 8044, + "task_loss": 0.011714087799191475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2153489589691162, + "epoch": 6.8, + "learning_rate": 1.7774960082652393e-05, + "loss": 0.7828, + "step": 8045, + "task_loss": 1.0043518543243408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9566121101379395, + "epoch": 6.8, + "learning_rate": 1.777026392411008e-05, + "loss": 0.8292, + "step": 8046, + "task_loss": 0.9423202276229858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5960302352905273, + "epoch": 6.8, + "learning_rate": 1.7765567765567766e-05, + "loss": 0.5172, + "step": 8047, + "task_loss": 0.40406519174575806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.629338800907135, + "epoch": 6.8, + "learning_rate": 1.7760871607025452e-05, + "loss": 0.8247, + "step": 8048, + "task_loss": 0.5215269327163696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.597755491733551, + "epoch": 6.8, + "learning_rate": 1.7756175448483142e-05, + "loss": 0.7298, + "step": 8049, + "task_loss": 0.49606993794441223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40288883447647095, + "epoch": 6.8, + "learning_rate": 1.7751479289940828e-05, + "loss": 0.6285, + "step": 8050, + "task_loss": 0.9734256267547607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5448126792907715, + "epoch": 6.81, + "learning_rate": 1.7746783131398518e-05, + "loss": 0.4602, + "step": 8051, + "task_loss": 0.40933969616889954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6007298231124878, + "epoch": 6.81, + "learning_rate": 1.7742086972856204e-05, + "loss": 0.7772, + "step": 8052, + "task_loss": 0.45853662490844727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8129710555076599, + "epoch": 6.81, + "learning_rate": 1.773739081431389e-05, + "loss": 0.7322, + "step": 8053, + "task_loss": 0.39477023482322693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6558352708816528, + "epoch": 6.81, + "learning_rate": 1.7732694655771577e-05, + "loss": 0.6112, + "step": 8054, + "task_loss": 0.5795919895172119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8077524900436401, + "epoch": 6.81, + "learning_rate": 1.7727998497229267e-05, + "loss": 0.7334, + "step": 8055, + "task_loss": 1.1315386295318604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4297342896461487, + "epoch": 6.81, + "learning_rate": 1.7723302338686956e-05, + "loss": 0.6482, + "step": 8056, + "task_loss": 0.5464257597923279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7266287207603455, + "epoch": 6.81, + "learning_rate": 1.7718606180144643e-05, + "loss": 0.6109, + "step": 8057, + "task_loss": 1.1522022485733032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6117545366287231, + "epoch": 6.81, + "learning_rate": 1.7713910021602333e-05, + "loss": 0.5551, + "step": 8058, + "task_loss": 0.7950785160064697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47485682368278503, + "epoch": 6.81, + "learning_rate": 1.770921386306002e-05, + "loss": 0.8362, + "step": 8059, + "task_loss": 0.6836209297180176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6119992733001709, + "epoch": 6.81, + "learning_rate": 1.7704517704517705e-05, + "loss": 0.7624, + "step": 8060, + "task_loss": 0.8337969183921814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9443872570991516, + "epoch": 6.81, + "learning_rate": 1.769982154597539e-05, + "loss": 0.714, + "step": 8061, + "task_loss": 0.518577516078949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8095872402191162, + "epoch": 6.81, + "learning_rate": 1.769512538743308e-05, + "loss": 0.6415, + "step": 8062, + "task_loss": 0.5041683912277222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8358902335166931, + "epoch": 6.82, + "learning_rate": 1.7690429228890768e-05, + "loss": 0.7834, + "step": 8063, + "task_loss": 0.292248010635376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5936229228973389, + "epoch": 6.82, + "learning_rate": 1.7685733070348457e-05, + "loss": 0.7463, + "step": 8064, + "task_loss": 0.23977041244506836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6914138197898865, + "epoch": 6.82, + "learning_rate": 1.7681036911806144e-05, + "loss": 0.6652, + "step": 8065, + "task_loss": 0.2436043620109558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5593518018722534, + "epoch": 6.82, + "learning_rate": 1.767634075326383e-05, + "loss": 0.5839, + "step": 8066, + "task_loss": 0.49997425079345703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9429531097412109, + "epoch": 6.82, + "learning_rate": 1.7671644594721516e-05, + "loss": 0.8617, + "step": 8067, + "task_loss": 1.357405185699463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5558225512504578, + "epoch": 6.82, + "learning_rate": 1.7666948436179206e-05, + "loss": 0.6137, + "step": 8068, + "task_loss": 0.5421656370162964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3932081162929535, + "epoch": 6.82, + "learning_rate": 1.7662252277636892e-05, + "loss": 0.6137, + "step": 8069, + "task_loss": 0.5381183624267578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6787765622138977, + "epoch": 6.82, + "learning_rate": 1.7657556119094582e-05, + "loss": 0.6853, + "step": 8070, + "task_loss": 0.5147101879119873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7284848690032959, + "epoch": 6.82, + "learning_rate": 1.7652859960552272e-05, + "loss": 0.9313, + "step": 8071, + "task_loss": 1.4166628122329712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6111432909965515, + "epoch": 6.82, + "learning_rate": 1.7648163802009958e-05, + "loss": 0.4862, + "step": 8072, + "task_loss": 0.7143856287002563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.987689197063446, + "epoch": 6.82, + "learning_rate": 1.7643467643467645e-05, + "loss": 0.608, + "step": 8073, + "task_loss": 1.2131282091140747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6877415180206299, + "epoch": 6.82, + "learning_rate": 1.763877148492533e-05, + "loss": 0.6034, + "step": 8074, + "task_loss": 0.7441884875297546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4577544033527374, + "epoch": 6.83, + "learning_rate": 1.763407532638302e-05, + "loss": 0.8457, + "step": 8075, + "task_loss": 0.046392105519771576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5881800651550293, + "epoch": 6.83, + "learning_rate": 1.7629379167840707e-05, + "loss": 0.5519, + "step": 8076, + "task_loss": 0.2376038283109665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6128990650177002, + "epoch": 6.83, + "learning_rate": 1.7624683009298397e-05, + "loss": 0.6078, + "step": 8077, + "task_loss": 1.2950046062469482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44172385334968567, + "epoch": 6.83, + "learning_rate": 1.7619986850756083e-05, + "loss": 0.4872, + "step": 8078, + "task_loss": 1.2197988033294678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3616688847541809, + "epoch": 6.83, + "learning_rate": 1.761529069221377e-05, + "loss": 0.5629, + "step": 8079, + "task_loss": 0.511985719203949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7056217789649963, + "epoch": 6.83, + "learning_rate": 1.7610594533671456e-05, + "loss": 0.7298, + "step": 8080, + "task_loss": 1.1085784435272217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.698265790939331, + "epoch": 6.83, + "learning_rate": 1.7605898375129145e-05, + "loss": 0.8441, + "step": 8081, + "task_loss": 1.2503331899642944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5099778175354004, + "epoch": 6.83, + "learning_rate": 1.7601202216586832e-05, + "loss": 0.7416, + "step": 8082, + "task_loss": 0.18721826374530792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8456506729125977, + "epoch": 6.83, + "learning_rate": 1.759650605804452e-05, + "loss": 0.8655, + "step": 8083, + "task_loss": 1.0812793970108032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7680861949920654, + "epoch": 6.83, + "learning_rate": 1.7591809899502208e-05, + "loss": 0.7015, + "step": 8084, + "task_loss": 0.3023846745491028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5395148396492004, + "epoch": 6.83, + "learning_rate": 1.7587113740959894e-05, + "loss": 0.7055, + "step": 8085, + "task_loss": 0.3873963952064514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7604743242263794, + "epoch": 6.83, + "learning_rate": 1.7582417582417584e-05, + "loss": 0.6056, + "step": 8086, + "task_loss": 0.40063923597335815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8563343286514282, + "epoch": 6.84, + "learning_rate": 1.757772142387527e-05, + "loss": 0.8563, + "step": 8087, + "task_loss": 1.076812744140625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44253695011138916, + "epoch": 6.84, + "learning_rate": 1.757302526533296e-05, + "loss": 0.5368, + "step": 8088, + "task_loss": 0.5698300004005432 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8797450661659241, + "epoch": 6.84, + "learning_rate": 1.7568329106790646e-05, + "loss": 0.6164, + "step": 8089, + "task_loss": 0.6212620139122009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.393185019493103, + "epoch": 6.84, + "learning_rate": 1.7563632948248336e-05, + "loss": 0.593, + "step": 8090, + "task_loss": 0.2525624930858612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7706207036972046, + "epoch": 6.84, + "learning_rate": 1.7558936789706022e-05, + "loss": 0.5679, + "step": 8091, + "task_loss": 0.38418513536453247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8188756704330444, + "epoch": 6.84, + "learning_rate": 1.755424063116371e-05, + "loss": 0.5242, + "step": 8092, + "task_loss": 0.9630873799324036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.333770751953125, + "epoch": 6.84, + "learning_rate": 1.7549544472621395e-05, + "loss": 0.8928, + "step": 8093, + "task_loss": 1.2351491451263428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6036602854728699, + "epoch": 6.84, + "learning_rate": 1.7544848314079085e-05, + "loss": 0.613, + "step": 8094, + "task_loss": 0.3608337640762329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8603923320770264, + "epoch": 6.84, + "learning_rate": 1.754015215553677e-05, + "loss": 0.6447, + "step": 8095, + "task_loss": 1.1995042562484741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7729570269584656, + "epoch": 6.84, + "learning_rate": 1.753545599699446e-05, + "loss": 0.7472, + "step": 8096, + "task_loss": 0.7506545782089233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6201485395431519, + "epoch": 6.84, + "learning_rate": 1.7530759838452147e-05, + "loss": 0.7093, + "step": 8097, + "task_loss": 1.1131349802017212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4656873941421509, + "epoch": 6.84, + "learning_rate": 1.7526063679909834e-05, + "loss": 0.7223, + "step": 8098, + "task_loss": 0.3676879405975342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6807432174682617, + "epoch": 6.85, + "learning_rate": 1.752136752136752e-05, + "loss": 0.7488, + "step": 8099, + "task_loss": 0.7314665913581848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6171824336051941, + "epoch": 6.85, + "learning_rate": 1.751667136282521e-05, + "loss": 0.6109, + "step": 8100, + "task_loss": 0.6037335991859436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6316799521446228, + "epoch": 6.85, + "learning_rate": 1.7511975204282896e-05, + "loss": 0.6375, + "step": 8101, + "task_loss": 1.289804458618164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5462307929992676, + "epoch": 6.85, + "learning_rate": 1.7507279045740586e-05, + "loss": 0.7078, + "step": 8102, + "task_loss": 0.0904335007071495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6417200565338135, + "epoch": 6.85, + "learning_rate": 1.7502582887198275e-05, + "loss": 0.7579, + "step": 8103, + "task_loss": 0.7018250823020935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8444219827651978, + "epoch": 6.85, + "learning_rate": 1.749788672865596e-05, + "loss": 0.8774, + "step": 8104, + "task_loss": 1.3110814094543457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5336322784423828, + "epoch": 6.85, + "learning_rate": 1.7493190570113648e-05, + "loss": 0.6376, + "step": 8105, + "task_loss": 1.0098947286605835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6110276579856873, + "epoch": 6.85, + "learning_rate": 1.7488494411571334e-05, + "loss": 0.663, + "step": 8106, + "task_loss": 1.1007336378097534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5509042739868164, + "epoch": 6.85, + "learning_rate": 1.7483798253029024e-05, + "loss": 0.4717, + "step": 8107, + "task_loss": 1.2208508253097534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45111000537872314, + "epoch": 6.85, + "learning_rate": 1.747910209448671e-05, + "loss": 0.7191, + "step": 8108, + "task_loss": 0.6024281978607178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8496984243392944, + "epoch": 6.85, + "learning_rate": 1.74744059359444e-05, + "loss": 0.8678, + "step": 8109, + "task_loss": 0.5414817333221436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6412103772163391, + "epoch": 6.85, + "learning_rate": 1.7469709777402087e-05, + "loss": 0.6477, + "step": 8110, + "task_loss": 1.0421559810638428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7199171185493469, + "epoch": 6.86, + "learning_rate": 1.7465013618859773e-05, + "loss": 0.6289, + "step": 8111, + "task_loss": 0.5767414569854736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2849491238594055, + "epoch": 6.86, + "learning_rate": 1.746031746031746e-05, + "loss": 0.5021, + "step": 8112, + "task_loss": 0.2874775230884552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9090442657470703, + "epoch": 6.86, + "learning_rate": 1.745562130177515e-05, + "loss": 0.7602, + "step": 8113, + "task_loss": 1.7106091976165771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5548120141029358, + "epoch": 6.86, + "learning_rate": 1.7450925143232835e-05, + "loss": 0.7201, + "step": 8114, + "task_loss": 0.20063790678977966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5733697414398193, + "epoch": 6.86, + "learning_rate": 1.7446228984690525e-05, + "loss": 0.6138, + "step": 8115, + "task_loss": 0.4877995252609253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43681979179382324, + "epoch": 6.86, + "learning_rate": 1.744153282614821e-05, + "loss": 0.4679, + "step": 8116, + "task_loss": 0.22855518758296967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6914660930633545, + "epoch": 6.86, + "learning_rate": 1.7436836667605898e-05, + "loss": 0.5971, + "step": 8117, + "task_loss": 0.8590465784072876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5538546442985535, + "epoch": 6.86, + "learning_rate": 1.7432140509063587e-05, + "loss": 0.5766, + "step": 8118, + "task_loss": 0.6903925538063049 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9577622413635254, + "epoch": 6.86, + "learning_rate": 1.7427444350521274e-05, + "loss": 0.6942, + "step": 8119, + "task_loss": 1.4929178953170776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6811849474906921, + "epoch": 6.86, + "learning_rate": 1.7422748191978963e-05, + "loss": 0.7369, + "step": 8120, + "task_loss": 0.5386447906494141 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6813338398933411, + "epoch": 6.86, + "learning_rate": 1.741805203343665e-05, + "loss": 0.7828, + "step": 8121, + "task_loss": 1.2732460498809814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.986494779586792, + "epoch": 6.87, + "learning_rate": 1.741335587489434e-05, + "loss": 0.8676, + "step": 8122, + "task_loss": 1.2339004278182983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6859373450279236, + "epoch": 6.87, + "learning_rate": 1.7408659716352026e-05, + "loss": 0.6468, + "step": 8123, + "task_loss": 0.7815605401992798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6443341970443726, + "epoch": 6.87, + "learning_rate": 1.7403963557809712e-05, + "loss": 0.7532, + "step": 8124, + "task_loss": 1.8356502056121826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6717321872711182, + "epoch": 6.87, + "learning_rate": 1.73992673992674e-05, + "loss": 0.6291, + "step": 8125, + "task_loss": 0.9082585573196411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.588945209980011, + "epoch": 6.87, + "learning_rate": 1.7394571240725088e-05, + "loss": 0.8331, + "step": 8126, + "task_loss": 0.8817654252052307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1301257610321045, + "epoch": 6.87, + "learning_rate": 1.7389875082182775e-05, + "loss": 0.8745, + "step": 8127, + "task_loss": 1.6246498823165894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5139785408973694, + "epoch": 6.87, + "learning_rate": 1.7385178923640464e-05, + "loss": 0.5089, + "step": 8128, + "task_loss": 0.14038360118865967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1417827606201172, + "epoch": 6.87, + "learning_rate": 1.738048276509815e-05, + "loss": 0.8875, + "step": 8129, + "task_loss": 0.6945188641548157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.971738338470459, + "epoch": 6.87, + "learning_rate": 1.7375786606555837e-05, + "loss": 0.6669, + "step": 8130, + "task_loss": 0.3653593957424164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47721678018569946, + "epoch": 6.87, + "learning_rate": 1.7371090448013523e-05, + "loss": 0.629, + "step": 8131, + "task_loss": 0.8040969371795654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.18415188789367676, + "epoch": 6.87, + "learning_rate": 1.7366394289471213e-05, + "loss": 0.6567, + "step": 8132, + "task_loss": 0.8115647435188293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5667914748191833, + "epoch": 6.87, + "learning_rate": 1.7361698130928903e-05, + "loss": 0.5564, + "step": 8133, + "task_loss": 0.2141646444797516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7211438417434692, + "epoch": 6.88, + "learning_rate": 1.735700197238659e-05, + "loss": 0.7305, + "step": 8134, + "task_loss": 1.0475313663482666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7538830637931824, + "epoch": 6.88, + "learning_rate": 1.735230581384428e-05, + "loss": 0.696, + "step": 8135, + "task_loss": 1.1333682537078857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.19587980210781097, + "epoch": 6.88, + "learning_rate": 1.7347609655301962e-05, + "loss": 0.4072, + "step": 8136, + "task_loss": 0.028609497472643852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5208079814910889, + "epoch": 6.88, + "learning_rate": 1.734291349675965e-05, + "loss": 0.5723, + "step": 8137, + "task_loss": 1.4317752122879028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39321523904800415, + "epoch": 6.88, + "learning_rate": 1.7338217338217338e-05, + "loss": 0.6205, + "step": 8138, + "task_loss": 0.7644479870796204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6411881446838379, + "epoch": 6.88, + "learning_rate": 1.7333521179675028e-05, + "loss": 0.8302, + "step": 8139, + "task_loss": 0.5537649989128113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5100871324539185, + "epoch": 6.88, + "learning_rate": 1.7328825021132714e-05, + "loss": 0.5629, + "step": 8140, + "task_loss": 0.1531241089105606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3914220929145813, + "epoch": 6.88, + "learning_rate": 1.7324128862590404e-05, + "loss": 0.563, + "step": 8141, + "task_loss": 0.10520809888839722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.28038308024406433, + "epoch": 6.88, + "learning_rate": 1.731943270404809e-05, + "loss": 0.5928, + "step": 8142, + "task_loss": 0.29231780767440796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.925789475440979, + "epoch": 6.88, + "learning_rate": 1.7314736545505776e-05, + "loss": 0.6737, + "step": 8143, + "task_loss": 0.6729716658592224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7494056224822998, + "epoch": 6.88, + "learning_rate": 1.7310040386963463e-05, + "loss": 0.6501, + "step": 8144, + "task_loss": 0.5337806344032288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6185156106948853, + "epoch": 6.88, + "learning_rate": 1.7305344228421152e-05, + "loss": 0.4341, + "step": 8145, + "task_loss": 0.7601549625396729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6812223792076111, + "epoch": 6.89, + "learning_rate": 1.730064806987884e-05, + "loss": 1.1008, + "step": 8146, + "task_loss": 0.4211384356021881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8308714628219604, + "epoch": 6.89, + "learning_rate": 1.729595191133653e-05, + "loss": 0.8542, + "step": 8147, + "task_loss": 0.368407279253006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6035304665565491, + "epoch": 6.89, + "learning_rate": 1.7291255752794215e-05, + "loss": 0.7483, + "step": 8148, + "task_loss": 0.7557377815246582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37824946641921997, + "epoch": 6.89, + "learning_rate": 1.72865595942519e-05, + "loss": 0.4731, + "step": 8149, + "task_loss": 0.3331942558288574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9033036231994629, + "epoch": 6.89, + "learning_rate": 1.728186343570959e-05, + "loss": 0.8013, + "step": 8150, + "task_loss": 0.6808444261550903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9987159371376038, + "epoch": 6.89, + "learning_rate": 1.7277167277167277e-05, + "loss": 0.8089, + "step": 8151, + "task_loss": 1.4711419343948364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6518374681472778, + "epoch": 6.89, + "learning_rate": 1.7272471118624967e-05, + "loss": 0.5299, + "step": 8152, + "task_loss": 0.41068917512893677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8298474550247192, + "epoch": 6.89, + "learning_rate": 1.7267774960082653e-05, + "loss": 0.6501, + "step": 8153, + "task_loss": 1.5824090242385864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7021873593330383, + "epoch": 6.89, + "learning_rate": 1.7263078801540343e-05, + "loss": 0.6806, + "step": 8154, + "task_loss": 0.9465674161911011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5204759836196899, + "epoch": 6.89, + "learning_rate": 1.725838264299803e-05, + "loss": 0.6582, + "step": 8155, + "task_loss": 0.45224758982658386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7647760510444641, + "epoch": 6.89, + "learning_rate": 1.7253686484455716e-05, + "loss": 0.7349, + "step": 8156, + "task_loss": 1.0484544038772583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7216331958770752, + "epoch": 6.89, + "learning_rate": 1.7248990325913402e-05, + "loss": 0.8441, + "step": 8157, + "task_loss": 0.5260019302368164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.762638509273529, + "epoch": 6.9, + "learning_rate": 1.7244294167371092e-05, + "loss": 0.5855, + "step": 8158, + "task_loss": 0.8446897268295288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7615695595741272, + "epoch": 6.9, + "learning_rate": 1.7239598008828778e-05, + "loss": 0.8133, + "step": 8159, + "task_loss": 0.6793491840362549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5369900465011597, + "epoch": 6.9, + "learning_rate": 1.7234901850286468e-05, + "loss": 0.8226, + "step": 8160, + "task_loss": 0.5459457039833069 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5788927674293518, + "epoch": 6.9, + "learning_rate": 1.7230205691744154e-05, + "loss": 0.7685, + "step": 8161, + "task_loss": 0.2308109998703003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5146836042404175, + "epoch": 6.9, + "learning_rate": 1.722550953320184e-05, + "loss": 0.8333, + "step": 8162, + "task_loss": 0.2709534168243408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4398704469203949, + "epoch": 6.9, + "learning_rate": 1.722081337465953e-05, + "loss": 0.8517, + "step": 8163, + "task_loss": 0.18044772744178772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5392913818359375, + "epoch": 6.9, + "learning_rate": 1.7216117216117217e-05, + "loss": 0.6371, + "step": 8164, + "task_loss": 0.893333911895752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2595422565937042, + "epoch": 6.9, + "learning_rate": 1.7211421057574906e-05, + "loss": 0.6546, + "step": 8165, + "task_loss": 0.20396387577056885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.512787938117981, + "epoch": 6.9, + "learning_rate": 1.7206724899032593e-05, + "loss": 0.6535, + "step": 8166, + "task_loss": 0.6848084926605225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40187567472457886, + "epoch": 6.9, + "learning_rate": 1.7202028740490282e-05, + "loss": 0.735, + "step": 8167, + "task_loss": 0.6131902933120728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5052165389060974, + "epoch": 6.9, + "learning_rate": 1.7197332581947965e-05, + "loss": 0.6917, + "step": 8168, + "task_loss": 0.5867551565170288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6432033777236938, + "epoch": 6.9, + "learning_rate": 1.7192636423405655e-05, + "loss": 0.7089, + "step": 8169, + "task_loss": 0.48835888504981995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8751969337463379, + "epoch": 6.91, + "learning_rate": 1.718794026486334e-05, + "loss": 0.7051, + "step": 8170, + "task_loss": 0.9686394333839417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7078771591186523, + "epoch": 6.91, + "learning_rate": 1.718324410632103e-05, + "loss": 0.7648, + "step": 8171, + "task_loss": 1.382512092590332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5595644116401672, + "epoch": 6.91, + "learning_rate": 1.7178547947778718e-05, + "loss": 0.6997, + "step": 8172, + "task_loss": 1.0817697048187256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6985122561454773, + "epoch": 6.91, + "learning_rate": 1.7173851789236407e-05, + "loss": 0.6749, + "step": 8173, + "task_loss": 1.4615943431854248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0168942213058472, + "epoch": 6.91, + "learning_rate": 1.7169155630694094e-05, + "loss": 0.9048, + "step": 8174, + "task_loss": 1.4677472114562988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3328709602355957, + "epoch": 6.91, + "learning_rate": 1.716445947215178e-05, + "loss": 0.6923, + "step": 8175, + "task_loss": 0.9472116827964783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5383782982826233, + "epoch": 6.91, + "learning_rate": 1.7159763313609466e-05, + "loss": 0.5835, + "step": 8176, + "task_loss": 0.19932466745376587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5307043790817261, + "epoch": 6.91, + "learning_rate": 1.7155067155067156e-05, + "loss": 0.6935, + "step": 8177, + "task_loss": 2.285045862197876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44599249958992004, + "epoch": 6.91, + "learning_rate": 1.7150370996524842e-05, + "loss": 0.6981, + "step": 8178, + "task_loss": 0.3017326295375824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3936731219291687, + "epoch": 6.91, + "learning_rate": 1.7145674837982532e-05, + "loss": 0.559, + "step": 8179, + "task_loss": 0.9211771488189697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6341065764427185, + "epoch": 6.91, + "learning_rate": 1.714097867944022e-05, + "loss": 0.5361, + "step": 8180, + "task_loss": 0.5204980969429016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9433261156082153, + "epoch": 6.91, + "learning_rate": 1.7136282520897905e-05, + "loss": 0.7306, + "step": 8181, + "task_loss": 0.9570217728614807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3418322503566742, + "epoch": 6.92, + "learning_rate": 1.7131586362355594e-05, + "loss": 0.5553, + "step": 8182, + "task_loss": 0.7738521099090576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5210729837417603, + "epoch": 6.92, + "learning_rate": 1.712689020381328e-05, + "loss": 0.8185, + "step": 8183, + "task_loss": 1.394591212272644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9095636010169983, + "epoch": 6.92, + "learning_rate": 1.712219404527097e-05, + "loss": 0.7429, + "step": 8184, + "task_loss": 1.4811350107192993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.57829749584198, + "epoch": 6.92, + "learning_rate": 1.7117497886728657e-05, + "loss": 0.5109, + "step": 8185, + "task_loss": 0.3243939280509949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6516492962837219, + "epoch": 6.92, + "learning_rate": 1.7112801728186347e-05, + "loss": 0.6236, + "step": 8186, + "task_loss": 1.203939437866211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3501132130622864, + "epoch": 6.92, + "learning_rate": 1.710810556964403e-05, + "loss": 0.5855, + "step": 8187, + "task_loss": 0.10756329447031021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6217337846755981, + "epoch": 6.92, + "learning_rate": 1.710340941110172e-05, + "loss": 0.5081, + "step": 8188, + "task_loss": 0.5090513825416565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6538976430892944, + "epoch": 6.92, + "learning_rate": 1.7098713252559406e-05, + "loss": 0.6761, + "step": 8189, + "task_loss": 0.5372185111045837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.443137526512146, + "epoch": 6.92, + "learning_rate": 1.7094017094017095e-05, + "loss": 0.6905, + "step": 8190, + "task_loss": 1.4223047494888306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4753764867782593, + "epoch": 6.92, + "learning_rate": 1.7089320935474782e-05, + "loss": 0.4717, + "step": 8191, + "task_loss": 0.6580274701118469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0920145511627197, + "epoch": 6.92, + "learning_rate": 1.708462477693247e-05, + "loss": 0.9592, + "step": 8192, + "task_loss": 0.6633914113044739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.775753378868103, + "epoch": 6.93, + "learning_rate": 1.7079928618390158e-05, + "loss": 0.7043, + "step": 8193, + "task_loss": 0.6734439730644226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8380321860313416, + "epoch": 6.93, + "learning_rate": 1.7075232459847844e-05, + "loss": 0.5941, + "step": 8194, + "task_loss": 0.4275500774383545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6615475416183472, + "epoch": 6.93, + "learning_rate": 1.7070536301305534e-05, + "loss": 0.6263, + "step": 8195, + "task_loss": 0.315200537443161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.85567706823349, + "epoch": 6.93, + "learning_rate": 1.706584014276322e-05, + "loss": 0.7725, + "step": 8196, + "task_loss": 0.892385721206665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3411781787872314, + "epoch": 6.93, + "learning_rate": 1.706114398422091e-05, + "loss": 0.8573, + "step": 8197, + "task_loss": 1.0106481313705444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6430004835128784, + "epoch": 6.93, + "learning_rate": 1.7056447825678596e-05, + "loss": 0.7649, + "step": 8198, + "task_loss": 0.3745318353176117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5760970711708069, + "epoch": 6.93, + "learning_rate": 1.7051751667136283e-05, + "loss": 0.5526, + "step": 8199, + "task_loss": 1.0307451486587524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5167843103408813, + "epoch": 6.93, + "learning_rate": 1.704705550859397e-05, + "loss": 0.6333, + "step": 8200, + "task_loss": 0.3765457570552826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35068458318710327, + "epoch": 6.93, + "learning_rate": 1.704235935005166e-05, + "loss": 0.8303, + "step": 8201, + "task_loss": 0.7199140191078186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6123879551887512, + "epoch": 6.93, + "learning_rate": 1.7037663191509345e-05, + "loss": 0.5824, + "step": 8202, + "task_loss": 1.0390218496322632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0084372758865356, + "epoch": 6.93, + "learning_rate": 1.7032967032967035e-05, + "loss": 0.7282, + "step": 8203, + "task_loss": 1.3634339570999146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.502724826335907, + "epoch": 6.93, + "learning_rate": 1.702827087442472e-05, + "loss": 0.5184, + "step": 8204, + "task_loss": 0.4759935140609741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4825650453567505, + "epoch": 6.94, + "learning_rate": 1.702357471588241e-05, + "loss": 0.6468, + "step": 8205, + "task_loss": 0.1319652497768402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6304592490196228, + "epoch": 6.94, + "learning_rate": 1.7018878557340097e-05, + "loss": 0.7158, + "step": 8206, + "task_loss": 1.5240745544433594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5963502526283264, + "epoch": 6.94, + "learning_rate": 1.7014182398797783e-05, + "loss": 0.5952, + "step": 8207, + "task_loss": 0.43074753880500793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30828818678855896, + "epoch": 6.94, + "learning_rate": 1.700948624025547e-05, + "loss": 0.6051, + "step": 8208, + "task_loss": 0.3159300684928894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5868615508079529, + "epoch": 6.94, + "learning_rate": 1.700479008171316e-05, + "loss": 0.6904, + "step": 8209, + "task_loss": 0.8068860769271851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7866762280464172, + "epoch": 6.94, + "learning_rate": 1.700009392317085e-05, + "loss": 0.854, + "step": 8210, + "task_loss": 1.5475836992263794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6057859659194946, + "epoch": 6.94, + "learning_rate": 1.6995397764628536e-05, + "loss": 0.5502, + "step": 8211, + "task_loss": 1.2508349418640137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33474215865135193, + "epoch": 6.94, + "learning_rate": 1.6990701606086222e-05, + "loss": 0.6884, + "step": 8212, + "task_loss": 0.46864303946495056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3510148227214813, + "epoch": 6.94, + "learning_rate": 1.6986005447543908e-05, + "loss": 0.7428, + "step": 8213, + "task_loss": 0.5251845717430115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8328485488891602, + "epoch": 6.94, + "learning_rate": 1.6981309289001598e-05, + "loss": 0.6434, + "step": 8214, + "task_loss": 0.4234258532524109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0460220575332642, + "epoch": 6.94, + "learning_rate": 1.6976613130459284e-05, + "loss": 0.7528, + "step": 8215, + "task_loss": 0.9814617037773132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8307148814201355, + "epoch": 6.94, + "learning_rate": 1.6971916971916974e-05, + "loss": 0.7871, + "step": 8216, + "task_loss": 0.4819900393486023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6192447543144226, + "epoch": 6.95, + "learning_rate": 1.696722081337466e-05, + "loss": 0.6727, + "step": 8217, + "task_loss": 0.8150542974472046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5546470284461975, + "epoch": 6.95, + "learning_rate": 1.696252465483235e-05, + "loss": 0.6425, + "step": 8218, + "task_loss": 1.4672889709472656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7491776943206787, + "epoch": 6.95, + "learning_rate": 1.6957828496290033e-05, + "loss": 0.6271, + "step": 8219, + "task_loss": 0.7813766598701477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7578977346420288, + "epoch": 6.95, + "learning_rate": 1.6953132337747723e-05, + "loss": 0.9538, + "step": 8220, + "task_loss": 0.643196165561676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8453570008277893, + "epoch": 6.95, + "learning_rate": 1.694843617920541e-05, + "loss": 0.6284, + "step": 8221, + "task_loss": 1.1305489540100098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7399015426635742, + "epoch": 6.95, + "learning_rate": 1.69437400206631e-05, + "loss": 0.6772, + "step": 8222, + "task_loss": 0.8548632264137268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8026523590087891, + "epoch": 6.95, + "learning_rate": 1.6939043862120785e-05, + "loss": 0.7572, + "step": 8223, + "task_loss": 0.8651464581489563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4790987968444824, + "epoch": 6.95, + "learning_rate": 1.6934347703578475e-05, + "loss": 0.6768, + "step": 8224, + "task_loss": 0.1657431274652481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8209432363510132, + "epoch": 6.95, + "learning_rate": 1.692965154503616e-05, + "loss": 0.8105, + "step": 8225, + "task_loss": 0.30885183811187744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6117878556251526, + "epoch": 6.95, + "learning_rate": 1.6924955386493848e-05, + "loss": 0.7415, + "step": 8226, + "task_loss": 0.4352125823497772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5691406726837158, + "epoch": 6.95, + "learning_rate": 1.6920259227951537e-05, + "loss": 0.6472, + "step": 8227, + "task_loss": 0.7705469727516174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.718815267086029, + "epoch": 6.95, + "learning_rate": 1.6915563069409224e-05, + "loss": 0.6092, + "step": 8228, + "task_loss": 0.5996741056442261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0849266052246094, + "epoch": 6.96, + "learning_rate": 1.6910866910866913e-05, + "loss": 0.7904, + "step": 8229, + "task_loss": 0.7435019016265869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9595440626144409, + "epoch": 6.96, + "learning_rate": 1.69061707523246e-05, + "loss": 0.8335, + "step": 8230, + "task_loss": 0.5913066267967224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7968192100524902, + "epoch": 6.96, + "learning_rate": 1.6901474593782286e-05, + "loss": 0.6633, + "step": 8231, + "task_loss": 0.7108122706413269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7491438388824463, + "epoch": 6.96, + "learning_rate": 1.6896778435239972e-05, + "loss": 0.7418, + "step": 8232, + "task_loss": 1.3001651763916016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45360925793647766, + "epoch": 6.96, + "learning_rate": 1.6892082276697662e-05, + "loss": 0.4893, + "step": 8233, + "task_loss": 0.18607290089130402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5600759983062744, + "epoch": 6.96, + "learning_rate": 1.688738611815535e-05, + "loss": 0.6194, + "step": 8234, + "task_loss": 0.6662044525146484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1775197982788086, + "epoch": 6.96, + "learning_rate": 1.6882689959613038e-05, + "loss": 0.807, + "step": 8235, + "task_loss": 1.0509095191955566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37883633375167847, + "epoch": 6.96, + "learning_rate": 1.6877993801070725e-05, + "loss": 0.5439, + "step": 8236, + "task_loss": 0.5414415597915649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5961737632751465, + "epoch": 6.96, + "learning_rate": 1.6873297642528414e-05, + "loss": 0.6374, + "step": 8237, + "task_loss": 1.0837223529815674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.55787593126297, + "epoch": 6.96, + "learning_rate": 1.68686014839861e-05, + "loss": 0.6399, + "step": 8238, + "task_loss": 0.5217234492301941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5135990381240845, + "epoch": 6.96, + "learning_rate": 1.6863905325443787e-05, + "loss": 0.6265, + "step": 8239, + "task_loss": 0.3302956223487854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7488323450088501, + "epoch": 6.96, + "learning_rate": 1.6859209166901477e-05, + "loss": 0.6831, + "step": 8240, + "task_loss": 0.3625866174697876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5580921769142151, + "epoch": 6.97, + "learning_rate": 1.6854513008359163e-05, + "loss": 0.7438, + "step": 8241, + "task_loss": 0.28112518787384033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46719154715538025, + "epoch": 6.97, + "learning_rate": 1.6849816849816853e-05, + "loss": 0.5208, + "step": 8242, + "task_loss": 0.8604084253311157 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.657219409942627, + "epoch": 6.97, + "learning_rate": 1.684512069127454e-05, + "loss": 0.7774, + "step": 8243, + "task_loss": 0.22282832860946655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2052643299102783, + "epoch": 6.97, + "learning_rate": 1.6840424532732225e-05, + "loss": 0.8043, + "step": 8244, + "task_loss": 1.2041168212890625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.913894772529602, + "epoch": 6.97, + "learning_rate": 1.6835728374189912e-05, + "loss": 0.7204, + "step": 8245, + "task_loss": 1.4525535106658936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9503656029701233, + "epoch": 6.97, + "learning_rate": 1.68310322156476e-05, + "loss": 0.846, + "step": 8246, + "task_loss": 0.8675366640090942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8129103183746338, + "epoch": 6.97, + "learning_rate": 1.6826336057105288e-05, + "loss": 0.6287, + "step": 8247, + "task_loss": 0.5677782893180847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8578076362609863, + "epoch": 6.97, + "learning_rate": 1.6821639898562978e-05, + "loss": 0.7832, + "step": 8248, + "task_loss": 0.9151036739349365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5810949802398682, + "epoch": 6.97, + "learning_rate": 1.6816943740020664e-05, + "loss": 0.6398, + "step": 8249, + "task_loss": 0.9931421875953674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1725833415985107, + "epoch": 6.97, + "learning_rate": 1.6812247581478354e-05, + "loss": 1.0306, + "step": 8250, + "task_loss": 1.592797040939331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3865169286727905, + "epoch": 6.97, + "learning_rate": 1.6807551422936037e-05, + "loss": 0.4941, + "step": 8251, + "task_loss": 0.2566170394420624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8158028721809387, + "epoch": 6.97, + "learning_rate": 1.6802855264393726e-05, + "loss": 0.7687, + "step": 8252, + "task_loss": 0.6082451939582825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6630833148956299, + "epoch": 6.98, + "learning_rate": 1.6798159105851413e-05, + "loss": 0.665, + "step": 8253, + "task_loss": 0.18085965514183044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6660779714584351, + "epoch": 6.98, + "learning_rate": 1.6793462947309102e-05, + "loss": 0.6627, + "step": 8254, + "task_loss": 1.0900431871414185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6251408457756042, + "epoch": 6.98, + "learning_rate": 1.678876678876679e-05, + "loss": 0.7099, + "step": 8255, + "task_loss": 0.817121684551239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5671027898788452, + "epoch": 6.98, + "learning_rate": 1.678407063022448e-05, + "loss": 0.5794, + "step": 8256, + "task_loss": 0.7014181017875671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3188495635986328, + "epoch": 6.98, + "learning_rate": 1.6779374471682165e-05, + "loss": 0.5816, + "step": 8257, + "task_loss": 0.0567346066236496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9631364941596985, + "epoch": 6.98, + "learning_rate": 1.677467831313985e-05, + "loss": 0.7888, + "step": 8258, + "task_loss": 0.7746395468711853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3986247479915619, + "epoch": 6.98, + "learning_rate": 1.676998215459754e-05, + "loss": 0.7492, + "step": 8259, + "task_loss": 2.021054267883301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0316064357757568, + "epoch": 6.98, + "learning_rate": 1.6765285996055227e-05, + "loss": 1.127, + "step": 8260, + "task_loss": 1.0009039640426636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9992505311965942, + "epoch": 6.98, + "learning_rate": 1.6760589837512917e-05, + "loss": 0.8928, + "step": 8261, + "task_loss": 0.9072684645652771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9369519948959351, + "epoch": 6.98, + "learning_rate": 1.6755893678970603e-05, + "loss": 0.9967, + "step": 8262, + "task_loss": 1.648752212524414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3783581256866455, + "epoch": 6.98, + "learning_rate": 1.675119752042829e-05, + "loss": 0.5941, + "step": 8263, + "task_loss": 0.586639404296875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.725365161895752, + "epoch": 6.99, + "learning_rate": 1.6746501361885976e-05, + "loss": 0.8363, + "step": 8264, + "task_loss": 0.7603028416633606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6735140085220337, + "epoch": 6.99, + "learning_rate": 1.6741805203343666e-05, + "loss": 0.7064, + "step": 8265, + "task_loss": 0.7526024580001831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5585086345672607, + "epoch": 6.99, + "learning_rate": 1.6737109044801352e-05, + "loss": 0.5811, + "step": 8266, + "task_loss": 0.31808507442474365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8583345413208008, + "epoch": 6.99, + "learning_rate": 1.6732412886259042e-05, + "loss": 0.651, + "step": 8267, + "task_loss": 0.9813191890716553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6888899803161621, + "epoch": 6.99, + "learning_rate": 1.6727716727716728e-05, + "loss": 0.6286, + "step": 8268, + "task_loss": 0.8632363080978394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.18807867169380188, + "epoch": 6.99, + "learning_rate": 1.6723020569174418e-05, + "loss": 0.4509, + "step": 8269, + "task_loss": 0.0150699932128191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5483200550079346, + "epoch": 6.99, + "learning_rate": 1.67183244106321e-05, + "loss": 0.5577, + "step": 8270, + "task_loss": 0.9129177331924438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8738926649093628, + "epoch": 6.99, + "learning_rate": 1.671362825208979e-05, + "loss": 0.7083, + "step": 8271, + "task_loss": 0.6949505805969238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.988156795501709, + "epoch": 6.99, + "learning_rate": 1.670893209354748e-05, + "loss": 0.7352, + "step": 8272, + "task_loss": 1.2114613056182861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3697923421859741, + "epoch": 6.99, + "learning_rate": 1.6704235935005167e-05, + "loss": 0.6265, + "step": 8273, + "task_loss": 0.45769426226615906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4095662534236908, + "epoch": 6.99, + "learning_rate": 1.6699539776462856e-05, + "loss": 0.6469, + "step": 8274, + "task_loss": 0.5900315642356873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7218039035797119, + "epoch": 6.99, + "learning_rate": 1.6694843617920543e-05, + "loss": 0.7413, + "step": 8275, + "task_loss": 1.0810508728027344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8326165676116943, + "epoch": 7.0, + "learning_rate": 1.669014745937823e-05, + "loss": 0.7588, + "step": 8276, + "task_loss": 1.031529426574707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25207871198654175, + "epoch": 7.0, + "learning_rate": 1.6685451300835915e-05, + "loss": 0.5325, + "step": 8277, + "task_loss": 1.3076504468917847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1497375965118408, + "epoch": 7.0, + "learning_rate": 1.6680755142293605e-05, + "loss": 0.7355, + "step": 8278, + "task_loss": 1.0966205596923828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46010109782218933, + "epoch": 7.0, + "learning_rate": 1.667605898375129e-05, + "loss": 0.6855, + "step": 8279, + "task_loss": 0.6017349362373352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5598931908607483, + "epoch": 7.0, + "learning_rate": 1.667136282520898e-05, + "loss": 0.703, + "step": 8280, + "task_loss": 0.30978474020957947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38397496938705444, + "epoch": 7.0, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.8206, + "step": 8281, + "task_loss": 0.09734808653593063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6556918621063232, + "epoch": 7.0, + "learning_rate": 1.6661970508124354e-05, + "loss": 1.2385, + "step": 8282, + "task_loss": 1.4574151039123535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4400886297225952, + "epoch": 7.0, + "learning_rate": 1.665727434958204e-05, + "loss": 0.5112, + "step": 8283, + "task_loss": 0.4875570833683014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7489319443702698, + "epoch": 7.0, + "learning_rate": 1.665257819103973e-05, + "loss": 0.8046, + "step": 8284, + "task_loss": 0.8576010465621948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7353480458259583, + "epoch": 7.0, + "learning_rate": 1.6647882032497416e-05, + "loss": 0.9708, + "step": 8285, + "task_loss": 0.8593668341636658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44553419947624207, + "epoch": 7.0, + "learning_rate": 1.6643185873955106e-05, + "loss": 0.5505, + "step": 8286, + "task_loss": 1.1241674423217773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8107983469963074, + "epoch": 7.01, + "learning_rate": 1.6638489715412796e-05, + "loss": 0.6563, + "step": 8287, + "task_loss": 1.1499189138412476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7141011357307434, + "epoch": 7.01, + "learning_rate": 1.6633793556870482e-05, + "loss": 0.6378, + "step": 8288, + "task_loss": 0.29104092717170715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4113994240760803, + "epoch": 7.01, + "learning_rate": 1.662909739832817e-05, + "loss": 0.4574, + "step": 8289, + "task_loss": 0.6397565603256226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4586312174797058, + "epoch": 7.01, + "learning_rate": 1.6624401239785855e-05, + "loss": 0.6528, + "step": 8290, + "task_loss": 0.5020477175712585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1405751705169678, + "epoch": 7.01, + "learning_rate": 1.6619705081243544e-05, + "loss": 0.8842, + "step": 8291, + "task_loss": 1.1339741945266724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5416803359985352, + "epoch": 7.01, + "learning_rate": 1.661500892270123e-05, + "loss": 0.5349, + "step": 8292, + "task_loss": 0.3730884790420532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.299846351146698, + "epoch": 7.01, + "learning_rate": 1.661031276415892e-05, + "loss": 0.5233, + "step": 8293, + "task_loss": 0.20945407450199127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7444020509719849, + "epoch": 7.01, + "learning_rate": 1.6605616605616607e-05, + "loss": 0.7975, + "step": 8294, + "task_loss": 1.6628270149230957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43929219245910645, + "epoch": 7.01, + "learning_rate": 1.6600920447074293e-05, + "loss": 0.5296, + "step": 8295, + "task_loss": 0.13940246403217316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3861343562602997, + "epoch": 7.01, + "learning_rate": 1.659622428853198e-05, + "loss": 0.6913, + "step": 8296, + "task_loss": 0.04676276072859764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7730162739753723, + "epoch": 7.01, + "learning_rate": 1.659152812998967e-05, + "loss": 0.6389, + "step": 8297, + "task_loss": 1.090760588645935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4901772439479828, + "epoch": 7.01, + "learning_rate": 1.6586831971447356e-05, + "loss": 0.4891, + "step": 8298, + "task_loss": 0.43117621541023254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7656208276748657, + "epoch": 7.02, + "learning_rate": 1.6582135812905045e-05, + "loss": 0.8823, + "step": 8299, + "task_loss": 0.6806660294532776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1021418571472168, + "epoch": 7.02, + "learning_rate": 1.657743965436273e-05, + "loss": 0.7504, + "step": 8300, + "task_loss": 1.0124170780181885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6733958721160889, + "epoch": 7.02, + "learning_rate": 1.657274349582042e-05, + "loss": 0.7511, + "step": 8301, + "task_loss": 0.6540461182594299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1094497442245483, + "epoch": 7.02, + "learning_rate": 1.6568047337278108e-05, + "loss": 0.6181, + "step": 8302, + "task_loss": 0.6314953565597534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8155375719070435, + "epoch": 7.02, + "learning_rate": 1.6563351178735794e-05, + "loss": 0.7071, + "step": 8303, + "task_loss": 0.8100626468658447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4388219714164734, + "epoch": 7.02, + "learning_rate": 1.6558655020193484e-05, + "loss": 0.5276, + "step": 8304, + "task_loss": 0.7258274555206299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.894784688949585, + "epoch": 7.02, + "learning_rate": 1.655395886165117e-05, + "loss": 0.6891, + "step": 8305, + "task_loss": 1.188578486442566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8514228463172913, + "epoch": 7.02, + "learning_rate": 1.654926270310886e-05, + "loss": 0.6633, + "step": 8306, + "task_loss": 0.9450716376304626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6548452377319336, + "epoch": 7.02, + "learning_rate": 1.6544566544566546e-05, + "loss": 0.7705, + "step": 8307, + "task_loss": 0.5335700511932373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5354621410369873, + "epoch": 7.02, + "learning_rate": 1.6539870386024233e-05, + "loss": 0.6857, + "step": 8308, + "task_loss": 0.8653998374938965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7935572266578674, + "epoch": 7.02, + "learning_rate": 1.653517422748192e-05, + "loss": 0.5743, + "step": 8309, + "task_loss": 0.7053214311599731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3738332986831665, + "epoch": 7.02, + "learning_rate": 1.653047806893961e-05, + "loss": 0.4891, + "step": 8310, + "task_loss": 0.6152140498161316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4086637794971466, + "epoch": 7.03, + "learning_rate": 1.6525781910397295e-05, + "loss": 0.4943, + "step": 8311, + "task_loss": 0.6559181213378906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5409867763519287, + "epoch": 7.03, + "learning_rate": 1.6521085751854985e-05, + "loss": 0.5073, + "step": 8312, + "task_loss": 0.6867883205413818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5739309787750244, + "epoch": 7.03, + "learning_rate": 1.651638959331267e-05, + "loss": 0.6707, + "step": 8313, + "task_loss": 0.4145790636539459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47136417031288147, + "epoch": 7.03, + "learning_rate": 1.6511693434770357e-05, + "loss": 0.5862, + "step": 8314, + "task_loss": 1.1957221031188965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2273435592651367, + "epoch": 7.03, + "learning_rate": 1.6506997276228044e-05, + "loss": 0.7451, + "step": 8315, + "task_loss": 2.7351536750793457 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9121997356414795, + "epoch": 7.03, + "learning_rate": 1.6502301117685733e-05, + "loss": 0.7575, + "step": 8316, + "task_loss": 0.8681555390357971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6077507734298706, + "epoch": 7.03, + "learning_rate": 1.6497604959143423e-05, + "loss": 0.6655, + "step": 8317, + "task_loss": 0.4065805971622467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.23441100120544434, + "epoch": 7.03, + "learning_rate": 1.649290880060111e-05, + "loss": 0.6035, + "step": 8318, + "task_loss": 0.08628642559051514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6208431720733643, + "epoch": 7.03, + "learning_rate": 1.64882126420588e-05, + "loss": 0.6877, + "step": 8319, + "task_loss": 0.4308265745639801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5482019782066345, + "epoch": 7.03, + "learning_rate": 1.6483516483516486e-05, + "loss": 0.615, + "step": 8320, + "task_loss": 0.5773104429244995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7421352863311768, + "epoch": 7.03, + "learning_rate": 1.6478820324974172e-05, + "loss": 0.513, + "step": 8321, + "task_loss": 0.9875578880310059 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8845319747924805, + "epoch": 7.03, + "learning_rate": 1.6474124166431858e-05, + "loss": 0.6105, + "step": 8322, + "task_loss": 1.1386936902999878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6981757283210754, + "epoch": 7.04, + "learning_rate": 1.6469428007889548e-05, + "loss": 0.7267, + "step": 8323, + "task_loss": 1.6347992420196533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8518170118331909, + "epoch": 7.04, + "learning_rate": 1.6464731849347234e-05, + "loss": 0.6639, + "step": 8324, + "task_loss": 0.5977916121482849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3985435962677002, + "epoch": 7.04, + "learning_rate": 1.6460035690804924e-05, + "loss": 0.7791, + "step": 8325, + "task_loss": 0.09390806406736374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38015830516815186, + "epoch": 7.04, + "learning_rate": 1.645533953226261e-05, + "loss": 0.4097, + "step": 8326, + "task_loss": 0.5957505106925964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49693095684051514, + "epoch": 7.04, + "learning_rate": 1.6450643373720297e-05, + "loss": 0.6061, + "step": 8327, + "task_loss": 1.3102939128875732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9543126821517944, + "epoch": 7.04, + "learning_rate": 1.6445947215177983e-05, + "loss": 0.7145, + "step": 8328, + "task_loss": 1.869619369506836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4659358561038971, + "epoch": 7.04, + "learning_rate": 1.6441251056635673e-05, + "loss": 0.4675, + "step": 8329, + "task_loss": 0.3125387728214264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5740473866462708, + "epoch": 7.04, + "learning_rate": 1.643655489809336e-05, + "loss": 0.6931, + "step": 8330, + "task_loss": 1.0422579050064087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8233962059020996, + "epoch": 7.04, + "learning_rate": 1.643185873955105e-05, + "loss": 0.6962, + "step": 8331, + "task_loss": 0.5217673778533936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5062025189399719, + "epoch": 7.04, + "learning_rate": 1.6427162581008735e-05, + "loss": 0.5152, + "step": 8332, + "task_loss": 0.8515752553939819 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6172130107879639, + "epoch": 7.04, + "learning_rate": 1.642246642246642e-05, + "loss": 0.5847, + "step": 8333, + "task_loss": 0.22528043389320374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.750946581363678, + "epoch": 7.04, + "learning_rate": 1.641777026392411e-05, + "loss": 0.6526, + "step": 8334, + "task_loss": 0.5244506001472473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5777402520179749, + "epoch": 7.05, + "learning_rate": 1.6413074105381798e-05, + "loss": 0.8268, + "step": 8335, + "task_loss": 0.5747224688529968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5651198625564575, + "epoch": 7.05, + "learning_rate": 1.6408377946839487e-05, + "loss": 0.7463, + "step": 8336, + "task_loss": 1.891367793083191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7866872549057007, + "epoch": 7.05, + "learning_rate": 1.6403681788297174e-05, + "loss": 0.6005, + "step": 8337, + "task_loss": 0.4510728716850281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44750288128852844, + "epoch": 7.05, + "learning_rate": 1.6398985629754863e-05, + "loss": 0.5219, + "step": 8338, + "task_loss": 0.362091600894928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6712720394134521, + "epoch": 7.05, + "learning_rate": 1.639428947121255e-05, + "loss": 0.6354, + "step": 8339, + "task_loss": 1.7426300048828125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4043712019920349, + "epoch": 7.05, + "learning_rate": 1.6389593312670236e-05, + "loss": 0.5755, + "step": 8340, + "task_loss": 0.3911268711090088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6254101991653442, + "epoch": 7.05, + "learning_rate": 1.6384897154127922e-05, + "loss": 0.6563, + "step": 8341, + "task_loss": 0.97128826379776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0306541919708252, + "epoch": 7.05, + "learning_rate": 1.6380200995585612e-05, + "loss": 0.9302, + "step": 8342, + "task_loss": 0.9813125133514404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7132619619369507, + "epoch": 7.05, + "learning_rate": 1.63755048370433e-05, + "loss": 0.5803, + "step": 8343, + "task_loss": 0.6856171488761902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.414203941822052, + "epoch": 7.05, + "learning_rate": 1.6370808678500988e-05, + "loss": 0.5955, + "step": 8344, + "task_loss": 0.8223376274108887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5863357782363892, + "epoch": 7.05, + "learning_rate": 1.6366112519958675e-05, + "loss": 0.6269, + "step": 8345, + "task_loss": 0.8799035549163818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41599979996681213, + "epoch": 7.05, + "learning_rate": 1.636141636141636e-05, + "loss": 0.5447, + "step": 8346, + "task_loss": 1.122682809829712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.776012122631073, + "epoch": 7.06, + "learning_rate": 1.6356720202874047e-05, + "loss": 0.5992, + "step": 8347, + "task_loss": 0.962998628616333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3335683047771454, + "epoch": 7.06, + "learning_rate": 1.6352024044331737e-05, + "loss": 0.655, + "step": 8348, + "task_loss": 0.3722030818462372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0464450120925903, + "epoch": 7.06, + "learning_rate": 1.6347327885789427e-05, + "loss": 0.7909, + "step": 8349, + "task_loss": 1.3056317567825317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9086690545082092, + "epoch": 7.06, + "learning_rate": 1.6342631727247113e-05, + "loss": 0.6709, + "step": 8350, + "task_loss": 1.1190757751464844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5207901000976562, + "epoch": 7.06, + "learning_rate": 1.6337935568704803e-05, + "loss": 0.6011, + "step": 8351, + "task_loss": 0.8883943557739258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7488521337509155, + "epoch": 7.06, + "learning_rate": 1.633323941016249e-05, + "loss": 0.7343, + "step": 8352, + "task_loss": 0.7539775371551514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6022988557815552, + "epoch": 7.06, + "learning_rate": 1.6328543251620175e-05, + "loss": 0.4697, + "step": 8353, + "task_loss": 0.47024282813072205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8555600643157959, + "epoch": 7.06, + "learning_rate": 1.6323847093077862e-05, + "loss": 0.7272, + "step": 8354, + "task_loss": 0.8987892270088196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7761326432228088, + "epoch": 7.06, + "learning_rate": 1.631915093453555e-05, + "loss": 0.7227, + "step": 8355, + "task_loss": 0.26742130517959595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4684299826622009, + "epoch": 7.06, + "learning_rate": 1.6314454775993238e-05, + "loss": 0.5858, + "step": 8356, + "task_loss": 0.9224099516868591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6391803026199341, + "epoch": 7.06, + "learning_rate": 1.6309758617450928e-05, + "loss": 0.6876, + "step": 8357, + "task_loss": 1.1253814697265625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5158731937408447, + "epoch": 7.07, + "learning_rate": 1.6305062458908614e-05, + "loss": 0.5682, + "step": 8358, + "task_loss": 1.0767371654510498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5337636470794678, + "epoch": 7.07, + "learning_rate": 1.63003663003663e-05, + "loss": 0.672, + "step": 8359, + "task_loss": 0.8821613788604736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.503973126411438, + "epoch": 7.07, + "learning_rate": 1.6295670141823987e-05, + "loss": 0.464, + "step": 8360, + "task_loss": 0.9670208096504211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6689848303794861, + "epoch": 7.07, + "learning_rate": 1.6290973983281676e-05, + "loss": 0.6229, + "step": 8361, + "task_loss": 1.2079135179519653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.502200722694397, + "epoch": 7.07, + "learning_rate": 1.6286277824739363e-05, + "loss": 0.5409, + "step": 8362, + "task_loss": 0.16884960234165192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5786138772964478, + "epoch": 7.07, + "learning_rate": 1.6281581666197052e-05, + "loss": 0.6795, + "step": 8363, + "task_loss": 0.13145966827869415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7672156691551208, + "epoch": 7.07, + "learning_rate": 1.6276885507654742e-05, + "loss": 0.6615, + "step": 8364, + "task_loss": 0.8369729518890381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35716062784194946, + "epoch": 7.07, + "learning_rate": 1.6272189349112425e-05, + "loss": 0.5238, + "step": 8365, + "task_loss": 0.6515402793884277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.517238438129425, + "epoch": 7.07, + "learning_rate": 1.6267493190570115e-05, + "loss": 0.5193, + "step": 8366, + "task_loss": 0.23876838386058807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3692840337753296, + "epoch": 7.07, + "learning_rate": 1.62627970320278e-05, + "loss": 0.6118, + "step": 8367, + "task_loss": 0.14951294660568237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46400558948516846, + "epoch": 7.07, + "learning_rate": 1.625810087348549e-05, + "loss": 0.5629, + "step": 8368, + "task_loss": 0.8510369062423706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6930915713310242, + "epoch": 7.07, + "learning_rate": 1.6253404714943177e-05, + "loss": 0.6677, + "step": 8369, + "task_loss": 0.7296593189239502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5029653310775757, + "epoch": 7.08, + "learning_rate": 1.6248708556400867e-05, + "loss": 0.7108, + "step": 8370, + "task_loss": 0.5812140107154846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8901102542877197, + "epoch": 7.08, + "learning_rate": 1.6244012397858553e-05, + "loss": 0.698, + "step": 8371, + "task_loss": 1.337058424949646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5888836979866028, + "epoch": 7.08, + "learning_rate": 1.623931623931624e-05, + "loss": 0.7048, + "step": 8372, + "task_loss": 1.8034273386001587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6505317687988281, + "epoch": 7.08, + "learning_rate": 1.6234620080773926e-05, + "loss": 0.7384, + "step": 8373, + "task_loss": 1.353248953819275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6936500668525696, + "epoch": 7.08, + "learning_rate": 1.6229923922231616e-05, + "loss": 0.7582, + "step": 8374, + "task_loss": 0.4444423019886017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6090441346168518, + "epoch": 7.08, + "learning_rate": 1.6225227763689302e-05, + "loss": 0.6093, + "step": 8375, + "task_loss": 0.14551401138305664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.54723060131073, + "epoch": 7.08, + "learning_rate": 1.6220531605146992e-05, + "loss": 0.6566, + "step": 8376, + "task_loss": 1.1575136184692383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5637768507003784, + "epoch": 7.08, + "learning_rate": 1.6215835446604678e-05, + "loss": 0.6548, + "step": 8377, + "task_loss": 0.6428893208503723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6288415193557739, + "epoch": 7.08, + "learning_rate": 1.6211139288062364e-05, + "loss": 0.6357, + "step": 8378, + "task_loss": 0.8283109664916992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7112703919410706, + "epoch": 7.08, + "learning_rate": 1.6206443129520054e-05, + "loss": 0.7574, + "step": 8379, + "task_loss": 1.2716329097747803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8259702920913696, + "epoch": 7.08, + "learning_rate": 1.620174697097774e-05, + "loss": 0.6221, + "step": 8380, + "task_loss": 2.2903759479522705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6075233221054077, + "epoch": 7.08, + "learning_rate": 1.619705081243543e-05, + "loss": 0.6167, + "step": 8381, + "task_loss": 1.2559176683425903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.932050347328186, + "epoch": 7.09, + "learning_rate": 1.6192354653893117e-05, + "loss": 0.7656, + "step": 8382, + "task_loss": 0.9545952081680298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3104938566684723, + "epoch": 7.09, + "learning_rate": 1.6187658495350806e-05, + "loss": 0.5944, + "step": 8383, + "task_loss": 1.149950385093689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7924899458885193, + "epoch": 7.09, + "learning_rate": 1.6182962336808493e-05, + "loss": 0.5242, + "step": 8384, + "task_loss": 1.4406886100769043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4501177668571472, + "epoch": 7.09, + "learning_rate": 1.617826617826618e-05, + "loss": 0.6871, + "step": 8385, + "task_loss": 0.4796721041202545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40108585357666016, + "epoch": 7.09, + "learning_rate": 1.6173570019723865e-05, + "loss": 0.6089, + "step": 8386, + "task_loss": 0.49110931158065796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0173066854476929, + "epoch": 7.09, + "learning_rate": 1.6168873861181555e-05, + "loss": 0.722, + "step": 8387, + "task_loss": 0.6607133746147156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48241448402404785, + "epoch": 7.09, + "learning_rate": 1.616417770263924e-05, + "loss": 0.6044, + "step": 8388, + "task_loss": 0.43951845169067383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7578156590461731, + "epoch": 7.09, + "learning_rate": 1.615948154409693e-05, + "loss": 0.5152, + "step": 8389, + "task_loss": 1.5265436172485352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.773209810256958, + "epoch": 7.09, + "learning_rate": 1.6154785385554617e-05, + "loss": 0.6822, + "step": 8390, + "task_loss": 0.5898456573486328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.069957971572876, + "epoch": 7.09, + "learning_rate": 1.6150089227012304e-05, + "loss": 0.7045, + "step": 8391, + "task_loss": 1.425801396369934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7311309576034546, + "epoch": 7.09, + "learning_rate": 1.614539306846999e-05, + "loss": 0.6637, + "step": 8392, + "task_loss": 0.9855448007583618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6902347803115845, + "epoch": 7.09, + "learning_rate": 1.614069690992768e-05, + "loss": 0.8205, + "step": 8393, + "task_loss": 0.6385382413864136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5805254578590393, + "epoch": 7.1, + "learning_rate": 1.613600075138537e-05, + "loss": 0.6405, + "step": 8394, + "task_loss": 1.0978924036026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4242471158504486, + "epoch": 7.1, + "learning_rate": 1.6131304592843056e-05, + "loss": 0.5221, + "step": 8395, + "task_loss": 0.236627995967865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7234374284744263, + "epoch": 7.1, + "learning_rate": 1.6126608434300746e-05, + "loss": 0.7463, + "step": 8396, + "task_loss": 0.532874345779419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5922473669052124, + "epoch": 7.1, + "learning_rate": 1.612191227575843e-05, + "loss": 0.6336, + "step": 8397, + "task_loss": 0.9460464119911194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8816364407539368, + "epoch": 7.1, + "learning_rate": 1.6117216117216118e-05, + "loss": 0.5933, + "step": 8398, + "task_loss": 1.5511754751205444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7044150233268738, + "epoch": 7.1, + "learning_rate": 1.6112519958673805e-05, + "loss": 0.8032, + "step": 8399, + "task_loss": 1.2868694067001343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7648730278015137, + "epoch": 7.1, + "learning_rate": 1.6107823800131494e-05, + "loss": 0.8261, + "step": 8400, + "task_loss": 0.5242278575897217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7344472408294678, + "epoch": 7.1, + "learning_rate": 1.610312764158918e-05, + "loss": 0.6747, + "step": 8401, + "task_loss": 0.5974429249763489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7975547909736633, + "epoch": 7.1, + "learning_rate": 1.609843148304687e-05, + "loss": 0.672, + "step": 8402, + "task_loss": 1.1172325611114502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4616832733154297, + "epoch": 7.1, + "learning_rate": 1.6093735324504557e-05, + "loss": 0.7492, + "step": 8403, + "task_loss": 0.7724802494049072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5627351999282837, + "epoch": 7.1, + "learning_rate": 1.6089039165962243e-05, + "loss": 0.7803, + "step": 8404, + "task_loss": 0.26859793066978455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5410542488098145, + "epoch": 7.1, + "learning_rate": 1.608434300741993e-05, + "loss": 0.5934, + "step": 8405, + "task_loss": 0.7923789620399475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48258042335510254, + "epoch": 7.11, + "learning_rate": 1.607964684887762e-05, + "loss": 0.8418, + "step": 8406, + "task_loss": 1.0501539707183838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5604144334793091, + "epoch": 7.11, + "learning_rate": 1.6074950690335306e-05, + "loss": 0.6774, + "step": 8407, + "task_loss": 0.9660919904708862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.864712119102478, + "epoch": 7.11, + "learning_rate": 1.6070254531792995e-05, + "loss": 0.7266, + "step": 8408, + "task_loss": 1.918562650680542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5800137519836426, + "epoch": 7.11, + "learning_rate": 1.606555837325068e-05, + "loss": 0.5854, + "step": 8409, + "task_loss": 1.0101346969604492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3330104649066925, + "epoch": 7.11, + "learning_rate": 1.6060862214708368e-05, + "loss": 0.5226, + "step": 8410, + "task_loss": 0.3834267556667328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6610935926437378, + "epoch": 7.11, + "learning_rate": 1.6056166056166058e-05, + "loss": 0.6349, + "step": 8411, + "task_loss": 0.2818596065044403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5351617932319641, + "epoch": 7.11, + "learning_rate": 1.6051469897623744e-05, + "loss": 0.5172, + "step": 8412, + "task_loss": 0.3360154926776886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7500707507133484, + "epoch": 7.11, + "learning_rate": 1.6046773739081434e-05, + "loss": 0.708, + "step": 8413, + "task_loss": 1.231766700744629 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8463991284370422, + "epoch": 7.11, + "learning_rate": 1.604207758053912e-05, + "loss": 0.8555, + "step": 8414, + "task_loss": 0.7372391819953918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6870501041412354, + "epoch": 7.11, + "learning_rate": 1.603738142199681e-05, + "loss": 0.6995, + "step": 8415, + "task_loss": 0.45287737250328064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3237491548061371, + "epoch": 7.11, + "learning_rate": 1.6032685263454493e-05, + "loss": 0.4813, + "step": 8416, + "task_loss": 0.664273202419281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5925995111465454, + "epoch": 7.11, + "learning_rate": 1.6027989104912182e-05, + "loss": 0.5193, + "step": 8417, + "task_loss": 0.8665277361869812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5214895606040955, + "epoch": 7.12, + "learning_rate": 1.602329294636987e-05, + "loss": 0.5948, + "step": 8418, + "task_loss": 0.4023206830024719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6630814075469971, + "epoch": 7.12, + "learning_rate": 1.601859678782756e-05, + "loss": 0.7933, + "step": 8419, + "task_loss": 0.6199644207954407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4708345830440521, + "epoch": 7.12, + "learning_rate": 1.6013900629285245e-05, + "loss": 0.5506, + "step": 8420, + "task_loss": 0.6653976440429688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8355787992477417, + "epoch": 7.12, + "learning_rate": 1.6009204470742935e-05, + "loss": 0.6807, + "step": 8421, + "task_loss": 1.0640462636947632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.154185175895691, + "epoch": 7.12, + "learning_rate": 1.600450831220062e-05, + "loss": 0.7133, + "step": 8422, + "task_loss": 1.1090424060821533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6231474280357361, + "epoch": 7.12, + "learning_rate": 1.5999812153658307e-05, + "loss": 0.8241, + "step": 8423, + "task_loss": 0.6542881727218628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.54659104347229, + "epoch": 7.12, + "learning_rate": 1.5995115995115994e-05, + "loss": 0.7338, + "step": 8424, + "task_loss": 0.19616270065307617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5259149074554443, + "epoch": 7.12, + "learning_rate": 1.5990419836573683e-05, + "loss": 0.6305, + "step": 8425, + "task_loss": 1.3076406717300415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3779906630516052, + "epoch": 7.12, + "learning_rate": 1.5985723678031373e-05, + "loss": 0.5903, + "step": 8426, + "task_loss": 0.4189998209476471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40941327810287476, + "epoch": 7.12, + "learning_rate": 1.598102751948906e-05, + "loss": 0.5589, + "step": 8427, + "task_loss": 0.6184937953948975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5693541765213013, + "epoch": 7.12, + "learning_rate": 1.5976331360946746e-05, + "loss": 0.7528, + "step": 8428, + "task_loss": 0.649507462978363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.128657579421997, + "epoch": 7.13, + "learning_rate": 1.5971635202404432e-05, + "loss": 0.8458, + "step": 8429, + "task_loss": 1.6795547008514404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6948497295379639, + "epoch": 7.13, + "learning_rate": 1.5966939043862122e-05, + "loss": 0.6684, + "step": 8430, + "task_loss": 1.08279287815094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4222497344017029, + "epoch": 7.13, + "learning_rate": 1.5962242885319808e-05, + "loss": 0.6892, + "step": 8431, + "task_loss": 0.14490440487861633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8063434958457947, + "epoch": 7.13, + "learning_rate": 1.5957546726777498e-05, + "loss": 0.6041, + "step": 8432, + "task_loss": 0.9191286563873291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7302582263946533, + "epoch": 7.13, + "learning_rate": 1.5952850568235184e-05, + "loss": 0.5793, + "step": 8433, + "task_loss": 0.8220800757408142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8247674703598022, + "epoch": 7.13, + "learning_rate": 1.5948154409692874e-05, + "loss": 0.58, + "step": 8434, + "task_loss": 1.2602459192276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33323851227760315, + "epoch": 7.13, + "learning_rate": 1.594345825115056e-05, + "loss": 0.5572, + "step": 8435, + "task_loss": 0.5495594143867493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45829302072525024, + "epoch": 7.13, + "learning_rate": 1.5938762092608247e-05, + "loss": 0.6407, + "step": 8436, + "task_loss": 0.31376197934150696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6748392581939697, + "epoch": 7.13, + "learning_rate": 1.5934065934065933e-05, + "loss": 0.744, + "step": 8437, + "task_loss": 0.6346133351325989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7275195121765137, + "epoch": 7.13, + "learning_rate": 1.5929369775523623e-05, + "loss": 0.774, + "step": 8438, + "task_loss": 1.1711560487747192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4574977159500122, + "epoch": 7.13, + "learning_rate": 1.592467361698131e-05, + "loss": 0.4762, + "step": 8439, + "task_loss": 0.46302270889282227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6328120231628418, + "epoch": 7.13, + "learning_rate": 1.5919977458439e-05, + "loss": 0.5774, + "step": 8440, + "task_loss": 0.7905809879302979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6379467844963074, + "epoch": 7.14, + "learning_rate": 1.5915281299896685e-05, + "loss": 0.9362, + "step": 8441, + "task_loss": 1.0344184637069702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7461031675338745, + "epoch": 7.14, + "learning_rate": 1.591058514135437e-05, + "loss": 0.667, + "step": 8442, + "task_loss": 1.4348578453063965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6148496270179749, + "epoch": 7.14, + "learning_rate": 1.590588898281206e-05, + "loss": 0.764, + "step": 8443, + "task_loss": 0.8455683588981628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3376334011554718, + "epoch": 7.14, + "learning_rate": 1.5901192824269748e-05, + "loss": 0.647, + "step": 8444, + "task_loss": 0.19927376508712769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9140491485595703, + "epoch": 7.14, + "learning_rate": 1.5896496665727437e-05, + "loss": 0.85, + "step": 8445, + "task_loss": 1.9344511032104492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3607456088066101, + "epoch": 7.14, + "learning_rate": 1.5891800507185124e-05, + "loss": 0.7155, + "step": 8446, + "task_loss": 0.8040563464164734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9077380299568176, + "epoch": 7.14, + "learning_rate": 1.5887104348642813e-05, + "loss": 0.5832, + "step": 8447, + "task_loss": 1.0322580337524414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7491998076438904, + "epoch": 7.14, + "learning_rate": 1.5882408190100496e-05, + "loss": 0.6584, + "step": 8448, + "task_loss": 1.1641910076141357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5760918855667114, + "epoch": 7.14, + "learning_rate": 1.5877712031558186e-05, + "loss": 0.6219, + "step": 8449, + "task_loss": 0.26931309700012207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4091125726699829, + "epoch": 7.14, + "learning_rate": 1.5873015873015872e-05, + "loss": 0.581, + "step": 8450, + "task_loss": 0.8228166699409485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6835980415344238, + "epoch": 7.14, + "learning_rate": 1.5868319714473562e-05, + "loss": 0.6247, + "step": 8451, + "task_loss": 0.39903631806373596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.681440532207489, + "epoch": 7.14, + "learning_rate": 1.586362355593125e-05, + "loss": 0.5343, + "step": 8452, + "task_loss": 0.3633407652378082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6754559874534607, + "epoch": 7.15, + "learning_rate": 1.5858927397388938e-05, + "loss": 0.5383, + "step": 8453, + "task_loss": 0.3753546476364136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4289522171020508, + "epoch": 7.15, + "learning_rate": 1.5854231238846624e-05, + "loss": 0.5601, + "step": 8454, + "task_loss": 0.7807856798171997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7678532600402832, + "epoch": 7.15, + "learning_rate": 1.584953508030431e-05, + "loss": 0.718, + "step": 8455, + "task_loss": 0.7275285720825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4696681797504425, + "epoch": 7.15, + "learning_rate": 1.5844838921762e-05, + "loss": 0.5792, + "step": 8456, + "task_loss": 0.4685693085193634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4084080457687378, + "epoch": 7.15, + "learning_rate": 1.5840142763219687e-05, + "loss": 0.7548, + "step": 8457, + "task_loss": 1.0007113218307495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5539934635162354, + "epoch": 7.15, + "learning_rate": 1.5835446604677377e-05, + "loss": 0.5574, + "step": 8458, + "task_loss": 0.423354834318161 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8159828186035156, + "epoch": 7.15, + "learning_rate": 1.5830750446135063e-05, + "loss": 0.6891, + "step": 8459, + "task_loss": 0.5290579199790955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4651435911655426, + "epoch": 7.15, + "learning_rate": 1.582605428759275e-05, + "loss": 0.8403, + "step": 8460, + "task_loss": 0.23370493948459625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3765687346458435, + "epoch": 7.15, + "learning_rate": 1.5821358129050436e-05, + "loss": 0.5758, + "step": 8461, + "task_loss": 0.6155165433883667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.357231855392456, + "epoch": 7.15, + "learning_rate": 1.5816661970508125e-05, + "loss": 0.7839, + "step": 8462, + "task_loss": 0.6769721508026123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3061888217926025, + "epoch": 7.15, + "learning_rate": 1.581196581196581e-05, + "loss": 0.8217, + "step": 8463, + "task_loss": 1.1904014348983765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7457218170166016, + "epoch": 7.15, + "learning_rate": 1.58072696534235e-05, + "loss": 0.5128, + "step": 8464, + "task_loss": 0.7859249114990234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3024817109107971, + "epoch": 7.16, + "learning_rate": 1.5802573494881188e-05, + "loss": 0.6329, + "step": 8465, + "task_loss": 0.831601083278656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.28634271025657654, + "epoch": 7.16, + "learning_rate": 1.5797877336338877e-05, + "loss": 0.418, + "step": 8466, + "task_loss": 0.1140139102935791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39979425072669983, + "epoch": 7.16, + "learning_rate": 1.5793181177796564e-05, + "loss": 0.4887, + "step": 8467, + "task_loss": 0.555949330329895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6898396015167236, + "epoch": 7.16, + "learning_rate": 1.578848501925425e-05, + "loss": 0.6519, + "step": 8468, + "task_loss": 0.49919649958610535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0725054740905762, + "epoch": 7.16, + "learning_rate": 1.5783788860711936e-05, + "loss": 0.6297, + "step": 8469, + "task_loss": 1.0409464836120605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35134634375572205, + "epoch": 7.16, + "learning_rate": 1.5779092702169626e-05, + "loss": 0.5515, + "step": 8470, + "task_loss": 0.1648109257221222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.92381352186203, + "epoch": 7.16, + "learning_rate": 1.5774396543627313e-05, + "loss": 0.7499, + "step": 8471, + "task_loss": 0.49265220761299133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7071285247802734, + "epoch": 7.16, + "learning_rate": 1.5769700385085002e-05, + "loss": 0.6543, + "step": 8472, + "task_loss": 0.687382698059082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6065130829811096, + "epoch": 7.16, + "learning_rate": 1.576500422654269e-05, + "loss": 0.6698, + "step": 8473, + "task_loss": 0.5867818593978882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6493736505508423, + "epoch": 7.16, + "learning_rate": 1.5760308068000375e-05, + "loss": 0.6903, + "step": 8474, + "task_loss": 1.305192232131958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.539053738117218, + "epoch": 7.16, + "learning_rate": 1.5755611909458065e-05, + "loss": 0.5821, + "step": 8475, + "task_loss": 0.9122236967086792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5077470541000366, + "epoch": 7.16, + "learning_rate": 1.575091575091575e-05, + "loss": 0.6549, + "step": 8476, + "task_loss": 0.6650866270065308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47042301297187805, + "epoch": 7.17, + "learning_rate": 1.574621959237344e-05, + "loss": 0.809, + "step": 8477, + "task_loss": 0.9839155673980713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6949329972267151, + "epoch": 7.17, + "learning_rate": 1.5741523433831127e-05, + "loss": 0.8172, + "step": 8478, + "task_loss": 1.3325800895690918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5441287159919739, + "epoch": 7.17, + "learning_rate": 1.5736827275288817e-05, + "loss": 0.6818, + "step": 8479, + "task_loss": 0.9120951890945435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6307831406593323, + "epoch": 7.17, + "learning_rate": 1.57321311167465e-05, + "loss": 0.6581, + "step": 8480, + "task_loss": 0.6517413258552551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43103671073913574, + "epoch": 7.17, + "learning_rate": 1.572743495820419e-05, + "loss": 0.5377, + "step": 8481, + "task_loss": 0.5408791899681091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7880616188049316, + "epoch": 7.17, + "learning_rate": 1.5722738799661876e-05, + "loss": 0.7546, + "step": 8482, + "task_loss": 1.2986208200454712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.215588927268982, + "epoch": 7.17, + "learning_rate": 1.5718042641119566e-05, + "loss": 0.803, + "step": 8483, + "task_loss": 2.5933656692504883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8796892166137695, + "epoch": 7.17, + "learning_rate": 1.5713346482577252e-05, + "loss": 0.6967, + "step": 8484, + "task_loss": 0.24904394149780273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47419190406799316, + "epoch": 7.17, + "learning_rate": 1.570865032403494e-05, + "loss": 0.7735, + "step": 8485, + "task_loss": 0.5119695663452148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4397846758365631, + "epoch": 7.17, + "learning_rate": 1.5703954165492628e-05, + "loss": 0.6287, + "step": 8486, + "task_loss": 0.36134228110313416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4957908093929291, + "epoch": 7.17, + "learning_rate": 1.5699258006950314e-05, + "loss": 0.6918, + "step": 8487, + "task_loss": 0.4896157383918762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4418303370475769, + "epoch": 7.17, + "learning_rate": 1.5694561848408004e-05, + "loss": 0.5859, + "step": 8488, + "task_loss": 0.5154657363891602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6294084787368774, + "epoch": 7.18, + "learning_rate": 1.568986568986569e-05, + "loss": 0.735, + "step": 8489, + "task_loss": 0.41483432054519653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5685089230537415, + "epoch": 7.18, + "learning_rate": 1.568516953132338e-05, + "loss": 0.6569, + "step": 8490, + "task_loss": 0.3258821666240692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0115339756011963, + "epoch": 7.18, + "learning_rate": 1.5680473372781066e-05, + "loss": 0.7211, + "step": 8491, + "task_loss": 1.0324640274047852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4696957767009735, + "epoch": 7.18, + "learning_rate": 1.5675777214238753e-05, + "loss": 0.6303, + "step": 8492, + "task_loss": 0.4900047183036804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4366559982299805, + "epoch": 7.18, + "learning_rate": 1.567108105569644e-05, + "loss": 0.7852, + "step": 8493, + "task_loss": 0.6420791745185852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7708212733268738, + "epoch": 7.18, + "learning_rate": 1.566638489715413e-05, + "loss": 0.6284, + "step": 8494, + "task_loss": 1.3581048250198364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7083960175514221, + "epoch": 7.18, + "learning_rate": 1.5661688738611815e-05, + "loss": 0.633, + "step": 8495, + "task_loss": 0.3804129362106323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31048887968063354, + "epoch": 7.18, + "learning_rate": 1.5656992580069505e-05, + "loss": 0.4934, + "step": 8496, + "task_loss": 0.03558212146162987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31715330481529236, + "epoch": 7.18, + "learning_rate": 1.565229642152719e-05, + "loss": 0.7615, + "step": 8497, + "task_loss": 0.09065178036689758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6127007007598877, + "epoch": 7.18, + "learning_rate": 1.564760026298488e-05, + "loss": 0.6427, + "step": 8498, + "task_loss": 0.2927553355693817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6079705953598022, + "epoch": 7.18, + "learning_rate": 1.5642904104442564e-05, + "loss": 0.7487, + "step": 8499, + "task_loss": 1.2215237617492676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8183196187019348, + "epoch": 7.19, + "learning_rate": 1.5638207945900254e-05, + "loss": 0.5683, + "step": 8500, + "task_loss": 0.743514895439148 + }, + { + "epoch": 7.19, + "eval_accuracy": 0.8953267326732673, + "eval_loss": 0.43779489398002625, + "eval_runtime": 227.9826, + "eval_samples_per_second": 110.754, + "eval_steps_per_second": 0.868, + "step": 8500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.990864098072052, + "epoch": 7.19, + "learning_rate": 1.563351178735794e-05, + "loss": 0.7585, + "step": 8501, + "task_loss": 1.5002120733261108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6665123701095581, + "epoch": 7.19, + "learning_rate": 1.562881562881563e-05, + "loss": 0.5778, + "step": 8502, + "task_loss": 0.3807893693447113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.370889276266098, + "epoch": 7.19, + "learning_rate": 1.562411947027332e-05, + "loss": 0.5326, + "step": 8503, + "task_loss": 0.12369082123041153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6021170616149902, + "epoch": 7.19, + "learning_rate": 1.5619423311731006e-05, + "loss": 0.7089, + "step": 8504, + "task_loss": 1.3640639781951904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7281806468963623, + "epoch": 7.19, + "learning_rate": 1.5614727153188692e-05, + "loss": 0.6435, + "step": 8505, + "task_loss": 0.7102740406990051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9422788619995117, + "epoch": 7.19, + "learning_rate": 1.561003099464638e-05, + "loss": 0.6493, + "step": 8506, + "task_loss": 0.6679918766021729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8928893208503723, + "epoch": 7.19, + "learning_rate": 1.5605334836104068e-05, + "loss": 0.8878, + "step": 8507, + "task_loss": 0.8306841254234314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6426607370376587, + "epoch": 7.19, + "learning_rate": 1.5600638677561755e-05, + "loss": 0.6971, + "step": 8508, + "task_loss": 0.1507406383752823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6246156096458435, + "epoch": 7.19, + "learning_rate": 1.5595942519019444e-05, + "loss": 0.6238, + "step": 8509, + "task_loss": 0.6361398696899414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7205590009689331, + "epoch": 7.19, + "learning_rate": 1.559124636047713e-05, + "loss": 0.6131, + "step": 8510, + "task_loss": 1.0189521312713623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9742196202278137, + "epoch": 7.19, + "learning_rate": 1.5586550201934817e-05, + "loss": 0.9449, + "step": 8511, + "task_loss": 1.8274142742156982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7933133840560913, + "epoch": 7.2, + "learning_rate": 1.5581854043392503e-05, + "loss": 0.8507, + "step": 8512, + "task_loss": 0.9836976528167725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7427475452423096, + "epoch": 7.2, + "learning_rate": 1.5577157884850193e-05, + "loss": 0.6915, + "step": 8513, + "task_loss": 0.4305715560913086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7178005576133728, + "epoch": 7.2, + "learning_rate": 1.557246172630788e-05, + "loss": 0.7118, + "step": 8514, + "task_loss": 0.8173565864562988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29163259267807007, + "epoch": 7.2, + "learning_rate": 1.556776556776557e-05, + "loss": 0.5681, + "step": 8515, + "task_loss": 0.6637154817581177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6451959609985352, + "epoch": 7.2, + "learning_rate": 1.5563069409223255e-05, + "loss": 0.6489, + "step": 8516, + "task_loss": 0.6102973222732544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6916806697845459, + "epoch": 7.2, + "learning_rate": 1.5558373250680945e-05, + "loss": 0.5728, + "step": 8517, + "task_loss": 0.7129629850387573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5293452739715576, + "epoch": 7.2, + "learning_rate": 1.555367709213863e-05, + "loss": 0.529, + "step": 8518, + "task_loss": 0.3213886320590973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3538872003555298, + "epoch": 7.2, + "learning_rate": 1.5548980933596318e-05, + "loss": 0.484, + "step": 8519, + "task_loss": 0.3541630208492279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46898022294044495, + "epoch": 7.2, + "learning_rate": 1.5544284775054008e-05, + "loss": 0.557, + "step": 8520, + "task_loss": 0.5379285216331482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.711785078048706, + "epoch": 7.2, + "learning_rate": 1.5539588616511694e-05, + "loss": 0.7633, + "step": 8521, + "task_loss": 0.36310264468193054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8124290704727173, + "epoch": 7.2, + "learning_rate": 1.5534892457969384e-05, + "loss": 0.7877, + "step": 8522, + "task_loss": 0.8821300864219666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8072690963745117, + "epoch": 7.2, + "learning_rate": 1.553019629942707e-05, + "loss": 0.5839, + "step": 8523, + "task_loss": 0.8811415433883667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7129310369491577, + "epoch": 7.21, + "learning_rate": 1.5525500140884756e-05, + "loss": 0.5829, + "step": 8524, + "task_loss": 0.7109659314155579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7270245552062988, + "epoch": 7.21, + "learning_rate": 1.5520803982342443e-05, + "loss": 0.5977, + "step": 8525, + "task_loss": 0.09737304598093033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8808525204658508, + "epoch": 7.21, + "learning_rate": 1.5516107823800132e-05, + "loss": 0.6795, + "step": 8526, + "task_loss": 0.7340739965438843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0130929946899414, + "epoch": 7.21, + "learning_rate": 1.551141166525782e-05, + "loss": 0.7507, + "step": 8527, + "task_loss": 0.5438169836997986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2938401401042938, + "epoch": 7.21, + "learning_rate": 1.550671550671551e-05, + "loss": 0.7239, + "step": 8528, + "task_loss": 0.34175828099250793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5199658274650574, + "epoch": 7.21, + "learning_rate": 1.5502019348173195e-05, + "loss": 0.6032, + "step": 8529, + "task_loss": 0.10153978317975998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6071274876594543, + "epoch": 7.21, + "learning_rate": 1.5497323189630885e-05, + "loss": 0.7428, + "step": 8530, + "task_loss": 1.5141441822052002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5149941444396973, + "epoch": 7.21, + "learning_rate": 1.5492627031088567e-05, + "loss": 0.6882, + "step": 8531, + "task_loss": 0.6275885105133057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8136538863182068, + "epoch": 7.21, + "learning_rate": 1.5487930872546257e-05, + "loss": 0.6001, + "step": 8532, + "task_loss": 0.5640076994895935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6719343066215515, + "epoch": 7.21, + "learning_rate": 1.5483234714003947e-05, + "loss": 0.7496, + "step": 8533, + "task_loss": 1.6522986888885498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40758007764816284, + "epoch": 7.21, + "learning_rate": 1.5478538555461633e-05, + "loss": 0.6011, + "step": 8534, + "task_loss": 0.13265427947044373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.976730227470398, + "epoch": 7.21, + "learning_rate": 1.5473842396919323e-05, + "loss": 0.6599, + "step": 8535, + "task_loss": 0.9018559455871582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6443747282028198, + "epoch": 7.22, + "learning_rate": 1.546914623837701e-05, + "loss": 0.6681, + "step": 8536, + "task_loss": 0.5526732206344604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9762014746665955, + "epoch": 7.22, + "learning_rate": 1.5464450079834696e-05, + "loss": 0.6616, + "step": 8537, + "task_loss": 0.34834176301956177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4865655303001404, + "epoch": 7.22, + "learning_rate": 1.5459753921292382e-05, + "loss": 0.6129, + "step": 8538, + "task_loss": 0.5381414890289307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49180179834365845, + "epoch": 7.22, + "learning_rate": 1.5455057762750072e-05, + "loss": 0.6005, + "step": 8539, + "task_loss": 1.3672940731048584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7183713912963867, + "epoch": 7.22, + "learning_rate": 1.5450361604207758e-05, + "loss": 0.5527, + "step": 8540, + "task_loss": 0.7811440229415894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5699954032897949, + "epoch": 7.22, + "learning_rate": 1.5445665445665448e-05, + "loss": 0.6313, + "step": 8541, + "task_loss": 0.3669542968273163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6848151683807373, + "epoch": 7.22, + "learning_rate": 1.5440969287123134e-05, + "loss": 0.6532, + "step": 8542, + "task_loss": 1.2367814779281616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8387200236320496, + "epoch": 7.22, + "learning_rate": 1.543627312858082e-05, + "loss": 0.7712, + "step": 8543, + "task_loss": 0.5017297267913818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7719025611877441, + "epoch": 7.22, + "learning_rate": 1.5431576970038507e-05, + "loss": 0.6654, + "step": 8544, + "task_loss": 0.5082730054855347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4586191475391388, + "epoch": 7.22, + "learning_rate": 1.5426880811496197e-05, + "loss": 0.5572, + "step": 8545, + "task_loss": 0.37065306305885315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7843990325927734, + "epoch": 7.22, + "learning_rate": 1.5422184652953883e-05, + "loss": 0.7177, + "step": 8546, + "task_loss": 1.9631074666976929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6040240526199341, + "epoch": 7.22, + "learning_rate": 1.5417488494411573e-05, + "loss": 0.6309, + "step": 8547, + "task_loss": 0.6570003032684326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5048094987869263, + "epoch": 7.23, + "learning_rate": 1.541279233586926e-05, + "loss": 0.5276, + "step": 8548, + "task_loss": 0.6570563912391663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7801402807235718, + "epoch": 7.23, + "learning_rate": 1.540809617732695e-05, + "loss": 0.6239, + "step": 8549, + "task_loss": 0.6335853934288025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5466333627700806, + "epoch": 7.23, + "learning_rate": 1.5403400018784635e-05, + "loss": 0.5655, + "step": 8550, + "task_loss": 0.58831787109375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7508009076118469, + "epoch": 7.23, + "learning_rate": 1.539870386024232e-05, + "loss": 0.7145, + "step": 8551, + "task_loss": 0.9097232222557068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8376821875572205, + "epoch": 7.23, + "learning_rate": 1.539400770170001e-05, + "loss": 0.7567, + "step": 8552, + "task_loss": 1.0124176740646362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5884583592414856, + "epoch": 7.23, + "learning_rate": 1.5389311543157697e-05, + "loss": 0.6158, + "step": 8553, + "task_loss": 0.4630068838596344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5653434991836548, + "epoch": 7.23, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.5671, + "step": 8554, + "task_loss": 0.3201290965080261 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6270184516906738, + "epoch": 7.23, + "learning_rate": 1.5379919226073074e-05, + "loss": 0.686, + "step": 8555, + "task_loss": 0.6143503189086914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4465900659561157, + "epoch": 7.23, + "learning_rate": 1.537522306753076e-05, + "loss": 0.6051, + "step": 8556, + "task_loss": 0.6222668290138245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4419277012348175, + "epoch": 7.23, + "learning_rate": 1.5370526908988446e-05, + "loss": 0.6918, + "step": 8557, + "task_loss": 1.2409132719039917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4639429748058319, + "epoch": 7.23, + "learning_rate": 1.5365830750446136e-05, + "loss": 0.5318, + "step": 8558, + "task_loss": 0.3866361677646637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45016348361968994, + "epoch": 7.23, + "learning_rate": 1.5361134591903822e-05, + "loss": 0.5282, + "step": 8559, + "task_loss": 0.9374175667762756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5793968439102173, + "epoch": 7.24, + "learning_rate": 1.5356438433361512e-05, + "loss": 0.6481, + "step": 8560, + "task_loss": 0.47957471013069153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4076997935771942, + "epoch": 7.24, + "learning_rate": 1.53517422748192e-05, + "loss": 0.7701, + "step": 8561, + "task_loss": 0.902180552482605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7959674596786499, + "epoch": 7.24, + "learning_rate": 1.5347046116276888e-05, + "loss": 0.8035, + "step": 8562, + "task_loss": 0.559281051158905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33718767762184143, + "epoch": 7.24, + "learning_rate": 1.534234995773457e-05, + "loss": 0.5085, + "step": 8563, + "task_loss": 0.1344453990459442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7191209197044373, + "epoch": 7.24, + "learning_rate": 1.533765379919226e-05, + "loss": 0.7322, + "step": 8564, + "task_loss": 1.3293659687042236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7134954929351807, + "epoch": 7.24, + "learning_rate": 1.533295764064995e-05, + "loss": 0.759, + "step": 8565, + "task_loss": 0.8483378887176514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7420100569725037, + "epoch": 7.24, + "learning_rate": 1.5328261482107637e-05, + "loss": 0.7333, + "step": 8566, + "task_loss": 0.6308411955833435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35758548974990845, + "epoch": 7.24, + "learning_rate": 1.5323565323565327e-05, + "loss": 0.5551, + "step": 8567, + "task_loss": 0.4732878506183624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4367782771587372, + "epoch": 7.24, + "learning_rate": 1.5318869165023013e-05, + "loss": 0.575, + "step": 8568, + "task_loss": 0.3457781672477722 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5821860432624817, + "epoch": 7.24, + "learning_rate": 1.53141730064807e-05, + "loss": 0.563, + "step": 8569, + "task_loss": 0.8253406286239624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0024961233139038, + "epoch": 7.24, + "learning_rate": 1.5309476847938386e-05, + "loss": 0.6662, + "step": 8570, + "task_loss": 0.9171218872070312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5770313739776611, + "epoch": 7.24, + "learning_rate": 1.5304780689396075e-05, + "loss": 0.6166, + "step": 8571, + "task_loss": 0.8891733288764954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5433895587921143, + "epoch": 7.25, + "learning_rate": 1.530008453085376e-05, + "loss": 0.5547, + "step": 8572, + "task_loss": 0.22213652729988098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4512890875339508, + "epoch": 7.25, + "learning_rate": 1.529538837231145e-05, + "loss": 0.6617, + "step": 8573, + "task_loss": 1.490756630897522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6963578462600708, + "epoch": 7.25, + "learning_rate": 1.5290692213769138e-05, + "loss": 0.7615, + "step": 8574, + "task_loss": 0.9361996650695801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6843407154083252, + "epoch": 7.25, + "learning_rate": 1.5285996055226824e-05, + "loss": 0.7653, + "step": 8575, + "task_loss": 1.0033915042877197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.701684832572937, + "epoch": 7.25, + "learning_rate": 1.528129989668451e-05, + "loss": 0.7899, + "step": 8576, + "task_loss": 0.13881148397922516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5214868783950806, + "epoch": 7.25, + "learning_rate": 1.52766037381422e-05, + "loss": 0.5854, + "step": 8577, + "task_loss": 0.21535342931747437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0778827667236328, + "epoch": 7.25, + "learning_rate": 1.5271907579599886e-05, + "loss": 0.7158, + "step": 8578, + "task_loss": 1.9342153072357178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3943029046058655, + "epoch": 7.25, + "learning_rate": 1.5267211421057576e-05, + "loss": 0.7071, + "step": 8579, + "task_loss": 0.5509730577468872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0713502168655396, + "epoch": 7.25, + "learning_rate": 1.5262515262515266e-05, + "loss": 0.7518, + "step": 8580, + "task_loss": 1.2229689359664917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5968737006187439, + "epoch": 7.25, + "learning_rate": 1.525781910397295e-05, + "loss": 0.5689, + "step": 8581, + "task_loss": 0.6183202266693115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8938381671905518, + "epoch": 7.25, + "learning_rate": 1.525312294543064e-05, + "loss": 0.7262, + "step": 8582, + "task_loss": 1.263463020324707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4779461622238159, + "epoch": 7.26, + "learning_rate": 1.5248426786888325e-05, + "loss": 0.6892, + "step": 8583, + "task_loss": 0.494126558303833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.095337152481079, + "epoch": 7.26, + "learning_rate": 1.5243730628346015e-05, + "loss": 0.6361, + "step": 8584, + "task_loss": 1.1588629484176636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.199995756149292, + "epoch": 7.26, + "learning_rate": 1.5239034469803701e-05, + "loss": 0.5409, + "step": 8585, + "task_loss": 0.25120800733566284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7136541604995728, + "epoch": 7.26, + "learning_rate": 1.5234338311261389e-05, + "loss": 0.6815, + "step": 8586, + "task_loss": 2.2232882976531982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4857661724090576, + "epoch": 7.26, + "learning_rate": 1.5229642152719075e-05, + "loss": 0.6437, + "step": 8587, + "task_loss": 0.1623697578907013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8200966715812683, + "epoch": 7.26, + "learning_rate": 1.5224945994176765e-05, + "loss": 0.5532, + "step": 8588, + "task_loss": 0.6997692584991455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8869998455047607, + "epoch": 7.26, + "learning_rate": 1.5220249835634451e-05, + "loss": 0.5281, + "step": 8589, + "task_loss": 1.501904845237732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.171127200126648, + "epoch": 7.26, + "learning_rate": 1.521555367709214e-05, + "loss": 0.7384, + "step": 8590, + "task_loss": 0.9104582667350769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5572950839996338, + "epoch": 7.26, + "learning_rate": 1.5210857518549826e-05, + "loss": 0.5954, + "step": 8591, + "task_loss": 0.7898914217948914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.853752613067627, + "epoch": 7.26, + "learning_rate": 1.5206161360007516e-05, + "loss": 0.8039, + "step": 8592, + "task_loss": 0.5806027054786682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4303239583969116, + "epoch": 7.26, + "learning_rate": 1.52014652014652e-05, + "loss": 0.5116, + "step": 8593, + "task_loss": 0.5097985863685608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8605223894119263, + "epoch": 7.26, + "learning_rate": 1.519676904292289e-05, + "loss": 0.7475, + "step": 8594, + "task_loss": 1.351440668106079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4471834599971771, + "epoch": 7.27, + "learning_rate": 1.5192072884380578e-05, + "loss": 0.6156, + "step": 8595, + "task_loss": 0.4556441307067871 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5589646100997925, + "epoch": 7.27, + "learning_rate": 1.5187376725838264e-05, + "loss": 0.6862, + "step": 8596, + "task_loss": 0.7318354845046997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6227444410324097, + "epoch": 7.27, + "learning_rate": 1.5182680567295954e-05, + "loss": 0.8409, + "step": 8597, + "task_loss": 0.6903026700019836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47638800740242004, + "epoch": 7.27, + "learning_rate": 1.517798440875364e-05, + "loss": 0.527, + "step": 8598, + "task_loss": 0.16575603187084198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6527043581008911, + "epoch": 7.27, + "learning_rate": 1.5173288250211328e-05, + "loss": 0.6512, + "step": 8599, + "task_loss": 0.6359402537345886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4439966678619385, + "epoch": 7.27, + "learning_rate": 1.5168592091669015e-05, + "loss": 0.565, + "step": 8600, + "task_loss": 1.2435956001281738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5545533895492554, + "epoch": 7.27, + "learning_rate": 1.5163895933126704e-05, + "loss": 0.5897, + "step": 8601, + "task_loss": 0.3241345286369324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42338550090789795, + "epoch": 7.27, + "learning_rate": 1.515919977458439e-05, + "loss": 0.5262, + "step": 8602, + "task_loss": 0.19649693369865417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1142032146453857, + "epoch": 7.27, + "learning_rate": 1.5154503616042079e-05, + "loss": 0.7495, + "step": 8603, + "task_loss": 1.2082784175872803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7056745886802673, + "epoch": 7.27, + "learning_rate": 1.5149807457499765e-05, + "loss": 0.4778, + "step": 8604, + "task_loss": 0.7226399779319763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9025708436965942, + "epoch": 7.27, + "learning_rate": 1.5145111298957453e-05, + "loss": 0.7743, + "step": 8605, + "task_loss": 1.637715220451355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8044679164886475, + "epoch": 7.27, + "learning_rate": 1.514041514041514e-05, + "loss": 0.7045, + "step": 8606, + "task_loss": 0.9451128244400024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2821432948112488, + "epoch": 7.28, + "learning_rate": 1.513571898187283e-05, + "loss": 0.515, + "step": 8607, + "task_loss": 0.4512132704257965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7404207587242126, + "epoch": 7.28, + "learning_rate": 1.5131022823330516e-05, + "loss": 0.6911, + "step": 8608, + "task_loss": 1.3049473762512207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41553324460983276, + "epoch": 7.28, + "learning_rate": 1.5126326664788204e-05, + "loss": 0.4665, + "step": 8609, + "task_loss": 0.5393701791763306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7349962592124939, + "epoch": 7.28, + "learning_rate": 1.5121630506245893e-05, + "loss": 0.7182, + "step": 8610, + "task_loss": 0.6264594793319702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37236443161964417, + "epoch": 7.28, + "learning_rate": 1.511693434770358e-05, + "loss": 0.5368, + "step": 8611, + "task_loss": 0.4163951575756073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4009607434272766, + "epoch": 7.28, + "learning_rate": 1.5112238189161268e-05, + "loss": 0.6087, + "step": 8612, + "task_loss": 0.13284240663051605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5572596192359924, + "epoch": 7.28, + "learning_rate": 1.5107542030618954e-05, + "loss": 0.71, + "step": 8613, + "task_loss": 0.7044717073440552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5306487083435059, + "epoch": 7.28, + "learning_rate": 1.5102845872076644e-05, + "loss": 0.5792, + "step": 8614, + "task_loss": 0.8213581442832947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5952543020248413, + "epoch": 7.28, + "learning_rate": 1.5098149713534328e-05, + "loss": 0.7204, + "step": 8615, + "task_loss": 0.21191759407520294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5639598965644836, + "epoch": 7.28, + "learning_rate": 1.5093453554992018e-05, + "loss": 0.6268, + "step": 8616, + "task_loss": 1.499599575996399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49424266815185547, + "epoch": 7.28, + "learning_rate": 1.5088757396449705e-05, + "loss": 0.696, + "step": 8617, + "task_loss": 0.501703143119812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9524234533309937, + "epoch": 7.28, + "learning_rate": 1.5084061237907393e-05, + "loss": 0.6934, + "step": 8618, + "task_loss": 0.9482629895210266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30029505491256714, + "epoch": 7.29, + "learning_rate": 1.5079365079365079e-05, + "loss": 0.6299, + "step": 8619, + "task_loss": 0.14176981151103973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37973201274871826, + "epoch": 7.29, + "learning_rate": 1.5074668920822769e-05, + "loss": 0.4765, + "step": 8620, + "task_loss": 1.4427130222320557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4664275348186493, + "epoch": 7.29, + "learning_rate": 1.5069972762280455e-05, + "loss": 0.6624, + "step": 8621, + "task_loss": 0.24563375115394592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0374733209609985, + "epoch": 7.29, + "learning_rate": 1.5065276603738143e-05, + "loss": 0.7258, + "step": 8622, + "task_loss": 1.0693204402923584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6193253397941589, + "epoch": 7.29, + "learning_rate": 1.506058044519583e-05, + "loss": 0.7572, + "step": 8623, + "task_loss": 1.529467225074768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.042726755142212, + "epoch": 7.29, + "learning_rate": 1.5055884286653519e-05, + "loss": 0.7586, + "step": 8624, + "task_loss": 0.6614536643028259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5109685659408569, + "epoch": 7.29, + "learning_rate": 1.5051188128111204e-05, + "loss": 0.8419, + "step": 8625, + "task_loss": 0.14167331159114838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46367791295051575, + "epoch": 7.29, + "learning_rate": 1.5046491969568893e-05, + "loss": 0.5989, + "step": 8626, + "task_loss": 0.38729405403137207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0347099304199219, + "epoch": 7.29, + "learning_rate": 1.5041795811026581e-05, + "loss": 0.8293, + "step": 8627, + "task_loss": 0.9797528982162476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.439480721950531, + "epoch": 7.29, + "learning_rate": 1.5037099652484268e-05, + "loss": 0.5909, + "step": 8628, + "task_loss": 0.19808103144168854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5047893524169922, + "epoch": 7.29, + "learning_rate": 1.5032403493941958e-05, + "loss": 0.6835, + "step": 8629, + "task_loss": 1.209742546081543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8834391832351685, + "epoch": 7.29, + "learning_rate": 1.5027707335399644e-05, + "loss": 0.7953, + "step": 8630, + "task_loss": 0.6462968587875366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5895835757255554, + "epoch": 7.3, + "learning_rate": 1.5023011176857332e-05, + "loss": 0.5498, + "step": 8631, + "task_loss": 0.33146315813064575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.544841468334198, + "epoch": 7.3, + "learning_rate": 1.5018315018315018e-05, + "loss": 0.5888, + "step": 8632, + "task_loss": 0.17890118062496185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3741670250892639, + "epoch": 7.3, + "learning_rate": 1.5013618859772708e-05, + "loss": 0.4397, + "step": 8633, + "task_loss": 0.13691885769367218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4873931407928467, + "epoch": 7.3, + "learning_rate": 1.5008922701230394e-05, + "loss": 0.6421, + "step": 8634, + "task_loss": 0.6182990074157715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6301355361938477, + "epoch": 7.3, + "learning_rate": 1.5004226542688082e-05, + "loss": 0.6625, + "step": 8635, + "task_loss": 0.8585296273231506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6403987407684326, + "epoch": 7.3, + "learning_rate": 1.4999530384145769e-05, + "loss": 0.6281, + "step": 8636, + "task_loss": 1.122861385345459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5298886299133301, + "epoch": 7.3, + "learning_rate": 1.4994834225603457e-05, + "loss": 0.58, + "step": 8637, + "task_loss": 0.4831189215183258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7167404890060425, + "epoch": 7.3, + "learning_rate": 1.4990138067061143e-05, + "loss": 0.4759, + "step": 8638, + "task_loss": 0.7935507297515869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49640846252441406, + "epoch": 7.3, + "learning_rate": 1.4985441908518833e-05, + "loss": 0.6088, + "step": 8639, + "task_loss": 0.11342264711856842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6727743148803711, + "epoch": 7.3, + "learning_rate": 1.4980745749976519e-05, + "loss": 0.8105, + "step": 8640, + "task_loss": 1.0446207523345947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7895699143409729, + "epoch": 7.3, + "learning_rate": 1.4976049591434207e-05, + "loss": 0.5556, + "step": 8641, + "task_loss": 0.6130087375640869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4656570553779602, + "epoch": 7.3, + "learning_rate": 1.4971353432891897e-05, + "loss": 0.4416, + "step": 8642, + "task_loss": 0.5980230569839478 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8635374903678894, + "epoch": 7.31, + "learning_rate": 1.4966657274349583e-05, + "loss": 0.5072, + "step": 8643, + "task_loss": 0.4091757535934448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6691185832023621, + "epoch": 7.31, + "learning_rate": 1.4961961115807271e-05, + "loss": 0.5451, + "step": 8644, + "task_loss": 0.31768468022346497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7229069471359253, + "epoch": 7.31, + "learning_rate": 1.4957264957264958e-05, + "loss": 0.6373, + "step": 8645, + "task_loss": 0.43700647354125977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6575817465782166, + "epoch": 7.31, + "learning_rate": 1.4952568798722647e-05, + "loss": 0.725, + "step": 8646, + "task_loss": 1.7849916219711304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.628978967666626, + "epoch": 7.31, + "learning_rate": 1.4947872640180332e-05, + "loss": 0.6417, + "step": 8647, + "task_loss": 0.8761529326438904 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48294228315353394, + "epoch": 7.31, + "learning_rate": 1.4943176481638022e-05, + "loss": 0.4889, + "step": 8648, + "task_loss": 0.41177865862846375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6754922866821289, + "epoch": 7.31, + "learning_rate": 1.4938480323095708e-05, + "loss": 0.6276, + "step": 8649, + "task_loss": 1.0458240509033203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48102807998657227, + "epoch": 7.31, + "learning_rate": 1.4933784164553396e-05, + "loss": 0.5467, + "step": 8650, + "task_loss": 0.2876395285129547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0646291971206665, + "epoch": 7.31, + "learning_rate": 1.4929088006011082e-05, + "loss": 0.9358, + "step": 8651, + "task_loss": 1.2112129926681519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6319409608840942, + "epoch": 7.31, + "learning_rate": 1.4924391847468772e-05, + "loss": 0.6308, + "step": 8652, + "task_loss": 0.8647340536117554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5363558530807495, + "epoch": 7.31, + "learning_rate": 1.4919695688926458e-05, + "loss": 0.5398, + "step": 8653, + "task_loss": 1.0107996463775635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6766921281814575, + "epoch": 7.32, + "learning_rate": 1.4914999530384147e-05, + "loss": 0.5864, + "step": 8654, + "task_loss": 1.168833613395691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8869651556015015, + "epoch": 7.32, + "learning_rate": 1.4910303371841833e-05, + "loss": 0.6275, + "step": 8655, + "task_loss": 1.202113389968872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6148958206176758, + "epoch": 7.32, + "learning_rate": 1.4905607213299521e-05, + "loss": 0.7419, + "step": 8656, + "task_loss": 0.7117589712142944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4083489775657654, + "epoch": 7.32, + "learning_rate": 1.490091105475721e-05, + "loss": 0.6641, + "step": 8657, + "task_loss": 0.12528233230113983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4983125329017639, + "epoch": 7.32, + "learning_rate": 1.4896214896214897e-05, + "loss": 0.6684, + "step": 8658, + "task_loss": 0.960888683795929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6405999660491943, + "epoch": 7.32, + "learning_rate": 1.4891518737672585e-05, + "loss": 0.8201, + "step": 8659, + "task_loss": 0.6992907524108887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8046799898147583, + "epoch": 7.32, + "learning_rate": 1.4886822579130271e-05, + "loss": 0.7135, + "step": 8660, + "task_loss": 1.0677145719528198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4784087538719177, + "epoch": 7.32, + "learning_rate": 1.4882126420587961e-05, + "loss": 0.556, + "step": 8661, + "task_loss": 0.9961147308349609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8396099209785461, + "epoch": 7.32, + "learning_rate": 1.4877430262045647e-05, + "loss": 0.6074, + "step": 8662, + "task_loss": 0.41624683141708374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3524929881095886, + "epoch": 7.32, + "learning_rate": 1.4872734103503335e-05, + "loss": 0.5099, + "step": 8663, + "task_loss": 0.8120133876800537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5391747951507568, + "epoch": 7.32, + "learning_rate": 1.4868037944961022e-05, + "loss": 0.7506, + "step": 8664, + "task_loss": 0.5638376474380493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8202834129333496, + "epoch": 7.32, + "learning_rate": 1.4863341786418711e-05, + "loss": 0.6622, + "step": 8665, + "task_loss": 1.4231985807418823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49967795610427856, + "epoch": 7.33, + "learning_rate": 1.4858645627876396e-05, + "loss": 0.5603, + "step": 8666, + "task_loss": 0.14163760840892792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4648592174053192, + "epoch": 7.33, + "learning_rate": 1.4853949469334086e-05, + "loss": 0.5709, + "step": 8667, + "task_loss": 0.1473139524459839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7350450754165649, + "epoch": 7.33, + "learning_rate": 1.4849253310791772e-05, + "loss": 0.8494, + "step": 8668, + "task_loss": 1.2249563932418823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3016418218612671, + "epoch": 7.33, + "learning_rate": 1.484455715224946e-05, + "loss": 0.598, + "step": 8669, + "task_loss": 0.34194833040237427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5355626344680786, + "epoch": 7.33, + "learning_rate": 1.4839860993707147e-05, + "loss": 0.561, + "step": 8670, + "task_loss": 0.8661082983016968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6150917410850525, + "epoch": 7.33, + "learning_rate": 1.4835164835164836e-05, + "loss": 0.8662, + "step": 8671, + "task_loss": 1.6340181827545166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.618216872215271, + "epoch": 7.33, + "learning_rate": 1.4830468676622524e-05, + "loss": 0.5475, + "step": 8672, + "task_loss": 0.31858983635902405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7242258787155151, + "epoch": 7.33, + "learning_rate": 1.482577251808021e-05, + "loss": 0.6977, + "step": 8673, + "task_loss": 0.7841537594795227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7661798000335693, + "epoch": 7.33, + "learning_rate": 1.48210763595379e-05, + "loss": 0.664, + "step": 8674, + "task_loss": 0.6289353370666504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29517796635627747, + "epoch": 7.33, + "learning_rate": 1.4816380200995587e-05, + "loss": 0.4745, + "step": 8675, + "task_loss": 0.857078492641449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.825374960899353, + "epoch": 7.33, + "learning_rate": 1.4811684042453275e-05, + "loss": 0.7758, + "step": 8676, + "task_loss": 1.0586249828338623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5238656401634216, + "epoch": 7.33, + "learning_rate": 1.4806987883910961e-05, + "loss": 0.6424, + "step": 8677, + "task_loss": 0.5459031462669373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49692875146865845, + "epoch": 7.34, + "learning_rate": 1.4802291725368649e-05, + "loss": 0.5896, + "step": 8678, + "task_loss": 0.8093302249908447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6671885251998901, + "epoch": 7.34, + "learning_rate": 1.4797595566826335e-05, + "loss": 0.7515, + "step": 8679, + "task_loss": 1.4242732524871826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3940938711166382, + "epoch": 7.34, + "learning_rate": 1.4792899408284025e-05, + "loss": 0.5529, + "step": 8680, + "task_loss": 0.3324246108531952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5185546278953552, + "epoch": 7.34, + "learning_rate": 1.4788203249741712e-05, + "loss": 0.7947, + "step": 8681, + "task_loss": 1.0122621059417725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.582156777381897, + "epoch": 7.34, + "learning_rate": 1.47835070911994e-05, + "loss": 0.6, + "step": 8682, + "task_loss": 1.037845492362976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6790263652801514, + "epoch": 7.34, + "learning_rate": 1.4778810932657086e-05, + "loss": 0.6542, + "step": 8683, + "task_loss": 0.5317809581756592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5058995485305786, + "epoch": 7.34, + "learning_rate": 1.4774114774114776e-05, + "loss": 0.5704, + "step": 8684, + "task_loss": 0.6986343264579773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8495147228240967, + "epoch": 7.34, + "learning_rate": 1.4769418615572462e-05, + "loss": 0.7278, + "step": 8685, + "task_loss": 0.6923937797546387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45155662298202515, + "epoch": 7.34, + "learning_rate": 1.476472245703015e-05, + "loss": 0.5432, + "step": 8686, + "task_loss": 0.6137672066688538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8339681625366211, + "epoch": 7.34, + "learning_rate": 1.476002629848784e-05, + "loss": 0.81, + "step": 8687, + "task_loss": 0.7972264289855957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.782361626625061, + "epoch": 7.34, + "learning_rate": 1.4755330139945524e-05, + "loss": 0.6808, + "step": 8688, + "task_loss": 0.4697839617729187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5311969518661499, + "epoch": 7.34, + "learning_rate": 1.4750633981403214e-05, + "loss": 0.6846, + "step": 8689, + "task_loss": 0.4134114384651184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3339974880218506, + "epoch": 7.35, + "learning_rate": 1.47459378228609e-05, + "loss": 0.7445, + "step": 8690, + "task_loss": 0.8545857071876526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3742201626300812, + "epoch": 7.35, + "learning_rate": 1.4741241664318589e-05, + "loss": 0.6829, + "step": 8691, + "task_loss": 0.7112624049186707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37157976627349854, + "epoch": 7.35, + "learning_rate": 1.4736545505776275e-05, + "loss": 0.6935, + "step": 8692, + "task_loss": 0.3239036798477173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.507222592830658, + "epoch": 7.35, + "learning_rate": 1.4731849347233965e-05, + "loss": 0.5901, + "step": 8693, + "task_loss": 0.17266567051410675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.906891405582428, + "epoch": 7.35, + "learning_rate": 1.4727153188691651e-05, + "loss": 0.6972, + "step": 8694, + "task_loss": 1.2502264976501465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3405589461326599, + "epoch": 7.35, + "learning_rate": 1.4722457030149339e-05, + "loss": 0.69, + "step": 8695, + "task_loss": 0.4206240475177765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5094888210296631, + "epoch": 7.35, + "learning_rate": 1.4717760871607025e-05, + "loss": 0.6903, + "step": 8696, + "task_loss": 0.362775981426239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7108370661735535, + "epoch": 7.35, + "learning_rate": 1.4713064713064715e-05, + "loss": 0.5955, + "step": 8697, + "task_loss": 1.44813871383667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4439922869205475, + "epoch": 7.35, + "learning_rate": 1.47083685545224e-05, + "loss": 0.6319, + "step": 8698, + "task_loss": 0.9470319747924805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0167666673660278, + "epoch": 7.35, + "learning_rate": 1.470367239598009e-05, + "loss": 0.8989, + "step": 8699, + "task_loss": 1.2659785747528076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5396394729614258, + "epoch": 7.35, + "learning_rate": 1.4698976237437776e-05, + "loss": 0.553, + "step": 8700, + "task_loss": 0.7688786387443542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6694654226303101, + "epoch": 7.35, + "learning_rate": 1.4694280078895464e-05, + "loss": 0.57, + "step": 8701, + "task_loss": 2.7519748210906982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9801220893859863, + "epoch": 7.36, + "learning_rate": 1.468958392035315e-05, + "loss": 0.7152, + "step": 8702, + "task_loss": 0.6829496026039124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5833518505096436, + "epoch": 7.36, + "learning_rate": 1.468488776181084e-05, + "loss": 0.6452, + "step": 8703, + "task_loss": 0.3410288691520691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6381517648696899, + "epoch": 7.36, + "learning_rate": 1.4680191603268528e-05, + "loss": 0.7912, + "step": 8704, + "task_loss": 0.9382807612419128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5460584163665771, + "epoch": 7.36, + "learning_rate": 1.4675495444726214e-05, + "loss": 0.5594, + "step": 8705, + "task_loss": 0.9401823878288269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5684000849723816, + "epoch": 7.36, + "learning_rate": 1.4670799286183904e-05, + "loss": 0.6745, + "step": 8706, + "task_loss": 0.22732090950012207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6048927307128906, + "epoch": 7.36, + "learning_rate": 1.466610312764159e-05, + "loss": 0.5237, + "step": 8707, + "task_loss": 0.6695435047149658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6827454566955566, + "epoch": 7.36, + "learning_rate": 1.4661406969099278e-05, + "loss": 0.602, + "step": 8708, + "task_loss": 0.4847581386566162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6723681092262268, + "epoch": 7.36, + "learning_rate": 1.4656710810556965e-05, + "loss": 0.6536, + "step": 8709, + "task_loss": 0.6679102778434753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45276665687561035, + "epoch": 7.36, + "learning_rate": 1.4652014652014653e-05, + "loss": 0.5876, + "step": 8710, + "task_loss": 0.27432307600975037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.743088960647583, + "epoch": 7.36, + "learning_rate": 1.4647318493472339e-05, + "loss": 0.72, + "step": 8711, + "task_loss": 0.42950600385665894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9692265391349792, + "epoch": 7.36, + "learning_rate": 1.4642622334930029e-05, + "loss": 0.7832, + "step": 8712, + "task_loss": 0.7369881868362427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6633397936820984, + "epoch": 7.36, + "learning_rate": 1.4637926176387715e-05, + "loss": 0.6079, + "step": 8713, + "task_loss": 0.4508334696292877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5879848599433899, + "epoch": 7.37, + "learning_rate": 1.4633230017845403e-05, + "loss": 0.5735, + "step": 8714, + "task_loss": 0.42097777128219604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39843183755874634, + "epoch": 7.37, + "learning_rate": 1.462853385930309e-05, + "loss": 0.7038, + "step": 8715, + "task_loss": 0.9105626940727234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8467691540718079, + "epoch": 7.37, + "learning_rate": 1.462383770076078e-05, + "loss": 0.6148, + "step": 8716, + "task_loss": 0.4705960750579834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7410545945167542, + "epoch": 7.37, + "learning_rate": 1.4619141542218464e-05, + "loss": 0.6942, + "step": 8717, + "task_loss": 0.15698537230491638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0142741203308105, + "epoch": 7.37, + "learning_rate": 1.4614445383676154e-05, + "loss": 0.7765, + "step": 8718, + "task_loss": 0.34348371624946594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4841160178184509, + "epoch": 7.37, + "learning_rate": 1.4609749225133843e-05, + "loss": 0.7112, + "step": 8719, + "task_loss": 0.2160560041666031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8912450075149536, + "epoch": 7.37, + "learning_rate": 1.4605053066591528e-05, + "loss": 0.8233, + "step": 8720, + "task_loss": 0.47277340292930603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47758734226226807, + "epoch": 7.37, + "learning_rate": 1.4600356908049218e-05, + "loss": 0.6314, + "step": 8721, + "task_loss": 0.12993839383125305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5585634708404541, + "epoch": 7.37, + "learning_rate": 1.4595660749506904e-05, + "loss": 0.6004, + "step": 8722, + "task_loss": 0.6484310030937195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9542001485824585, + "epoch": 7.37, + "learning_rate": 1.4590964590964592e-05, + "loss": 0.7887, + "step": 8723, + "task_loss": 0.5278677344322205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45783936977386475, + "epoch": 7.37, + "learning_rate": 1.4586268432422278e-05, + "loss": 0.6729, + "step": 8724, + "task_loss": 0.5096408128738403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.668376088142395, + "epoch": 7.38, + "learning_rate": 1.4581572273879968e-05, + "loss": 0.6763, + "step": 8725, + "task_loss": 0.4645616114139557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3960376977920532, + "epoch": 7.38, + "learning_rate": 1.4576876115337654e-05, + "loss": 0.5561, + "step": 8726, + "task_loss": 0.8314023017883301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8907775282859802, + "epoch": 7.38, + "learning_rate": 1.4572179956795342e-05, + "loss": 0.5824, + "step": 8727, + "task_loss": 0.41596946120262146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5307607054710388, + "epoch": 7.38, + "learning_rate": 1.4567483798253029e-05, + "loss": 0.5946, + "step": 8728, + "task_loss": 1.236758828163147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6529446244239807, + "epoch": 7.38, + "learning_rate": 1.4562787639710717e-05, + "loss": 0.5652, + "step": 8729, + "task_loss": 0.7934979796409607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45125722885131836, + "epoch": 7.38, + "learning_rate": 1.4558091481168403e-05, + "loss": 0.6542, + "step": 8730, + "task_loss": 0.8367568850517273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5242650508880615, + "epoch": 7.38, + "learning_rate": 1.4553395322626093e-05, + "loss": 0.6832, + "step": 8731, + "task_loss": 1.2408488988876343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5519081354141235, + "epoch": 7.38, + "learning_rate": 1.454869916408378e-05, + "loss": 0.6061, + "step": 8732, + "task_loss": 0.860744833946228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8766235113143921, + "epoch": 7.38, + "learning_rate": 1.4544003005541467e-05, + "loss": 0.6759, + "step": 8733, + "task_loss": 1.1656720638275146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5323986411094666, + "epoch": 7.38, + "learning_rate": 1.4539306846999157e-05, + "loss": 0.5581, + "step": 8734, + "task_loss": 0.4563811719417572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7628233432769775, + "epoch": 7.38, + "learning_rate": 1.4534610688456843e-05, + "loss": 0.6233, + "step": 8735, + "task_loss": 1.8154648542404175 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29266467690467834, + "epoch": 7.38, + "learning_rate": 1.4529914529914531e-05, + "loss": 0.7151, + "step": 8736, + "task_loss": 0.136705681681633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.046168327331543, + "epoch": 7.39, + "learning_rate": 1.4525218371372218e-05, + "loss": 0.8254, + "step": 8737, + "task_loss": 0.8139264583587646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4476660490036011, + "epoch": 7.39, + "learning_rate": 1.4520522212829907e-05, + "loss": 0.4797, + "step": 8738, + "task_loss": 0.4886324107646942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29297012090682983, + "epoch": 7.39, + "learning_rate": 1.4515826054287592e-05, + "loss": 0.5864, + "step": 8739, + "task_loss": 0.16907556354999542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37060993909835815, + "epoch": 7.39, + "learning_rate": 1.4511129895745282e-05, + "loss": 0.6409, + "step": 8740, + "task_loss": 0.5214897990226746 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5165072083473206, + "epoch": 7.39, + "learning_rate": 1.4506433737202968e-05, + "loss": 0.6421, + "step": 8741, + "task_loss": 0.28513848781585693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8321191072463989, + "epoch": 7.39, + "learning_rate": 1.4501737578660656e-05, + "loss": 0.5909, + "step": 8742, + "task_loss": 0.9241749048233032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7901775240898132, + "epoch": 7.39, + "learning_rate": 1.4497041420118343e-05, + "loss": 0.6769, + "step": 8743, + "task_loss": 1.4530508518218994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4174685478210449, + "epoch": 7.39, + "learning_rate": 1.4492345261576032e-05, + "loss": 0.6259, + "step": 8744, + "task_loss": 0.7804873585700989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7847021818161011, + "epoch": 7.39, + "learning_rate": 1.4487649103033719e-05, + "loss": 0.8942, + "step": 8745, + "task_loss": 1.1442476511001587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6377036571502686, + "epoch": 7.39, + "learning_rate": 1.4482952944491407e-05, + "loss": 0.6128, + "step": 8746, + "task_loss": 0.3207526206970215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7172498106956482, + "epoch": 7.39, + "learning_rate": 1.4478256785949093e-05, + "loss": 0.5288, + "step": 8747, + "task_loss": 0.7126907706260681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4844219386577606, + "epoch": 7.39, + "learning_rate": 1.4473560627406783e-05, + "loss": 0.6121, + "step": 8748, + "task_loss": 1.3001459836959839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7498569488525391, + "epoch": 7.4, + "learning_rate": 1.446886446886447e-05, + "loss": 0.712, + "step": 8749, + "task_loss": 0.9875186681747437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37392911314964294, + "epoch": 7.4, + "learning_rate": 1.4464168310322157e-05, + "loss": 0.5098, + "step": 8750, + "task_loss": 0.2572353482246399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5764129161834717, + "epoch": 7.4, + "learning_rate": 1.4459472151779845e-05, + "loss": 0.5735, + "step": 8751, + "task_loss": 0.2921358644962311 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4462422728538513, + "epoch": 7.4, + "learning_rate": 1.4454775993237531e-05, + "loss": 0.6822, + "step": 8752, + "task_loss": 0.2888237237930298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6485174894332886, + "epoch": 7.4, + "learning_rate": 1.4450079834695221e-05, + "loss": 0.6735, + "step": 8753, + "task_loss": 0.6915189027786255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7523515224456787, + "epoch": 7.4, + "learning_rate": 1.4445383676152908e-05, + "loss": 0.5511, + "step": 8754, + "task_loss": 1.1371971368789673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5083110332489014, + "epoch": 7.4, + "learning_rate": 1.4440687517610596e-05, + "loss": 0.4978, + "step": 8755, + "task_loss": 0.5730241537094116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0605354309082031, + "epoch": 7.4, + "learning_rate": 1.4435991359068282e-05, + "loss": 0.7732, + "step": 8756, + "task_loss": 0.8700273036956787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4830750823020935, + "epoch": 7.4, + "learning_rate": 1.4431295200525972e-05, + "loss": 0.4645, + "step": 8757, + "task_loss": 0.5995084643363953 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8360882997512817, + "epoch": 7.4, + "learning_rate": 1.4426599041983658e-05, + "loss": 0.7932, + "step": 8758, + "task_loss": 1.0043638944625854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9077550172805786, + "epoch": 7.4, + "learning_rate": 1.4421902883441346e-05, + "loss": 0.7436, + "step": 8759, + "task_loss": 2.477295160293579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7847498655319214, + "epoch": 7.4, + "learning_rate": 1.4417206724899032e-05, + "loss": 0.6052, + "step": 8760, + "task_loss": 1.7526910305023193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6511313915252686, + "epoch": 7.41, + "learning_rate": 1.441251056635672e-05, + "loss": 0.5758, + "step": 8761, + "task_loss": 0.6303622126579285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7562260627746582, + "epoch": 7.41, + "learning_rate": 1.4407814407814407e-05, + "loss": 0.6852, + "step": 8762, + "task_loss": 0.9909593462944031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8733607530593872, + "epoch": 7.41, + "learning_rate": 1.4403118249272096e-05, + "loss": 0.6407, + "step": 8763, + "task_loss": 1.2774608135223389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5715016722679138, + "epoch": 7.41, + "learning_rate": 1.4398422090729784e-05, + "loss": 0.6216, + "step": 8764, + "task_loss": 0.7382771968841553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5520987510681152, + "epoch": 7.41, + "learning_rate": 1.439372593218747e-05, + "loss": 0.7611, + "step": 8765, + "task_loss": 0.5457879304885864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6868373155593872, + "epoch": 7.41, + "learning_rate": 1.438902977364516e-05, + "loss": 0.6991, + "step": 8766, + "task_loss": 1.0754362344741821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8684296011924744, + "epoch": 7.41, + "learning_rate": 1.4384333615102847e-05, + "loss": 0.6504, + "step": 8767, + "task_loss": 0.848599374294281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43506577610969543, + "epoch": 7.41, + "learning_rate": 1.4379637456560535e-05, + "loss": 0.5854, + "step": 8768, + "task_loss": 0.5774693489074707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40215206146240234, + "epoch": 7.41, + "learning_rate": 1.4374941298018221e-05, + "loss": 0.5609, + "step": 8769, + "task_loss": 0.2691816985607147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6169037818908691, + "epoch": 7.41, + "learning_rate": 1.4370245139475911e-05, + "loss": 0.5367, + "step": 8770, + "task_loss": 0.3403697609901428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4808915853500366, + "epoch": 7.41, + "learning_rate": 1.4365548980933596e-05, + "loss": 0.573, + "step": 8771, + "task_loss": 0.3571156859397888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5896939039230347, + "epoch": 7.41, + "learning_rate": 1.4360852822391285e-05, + "loss": 0.5652, + "step": 8772, + "task_loss": 1.4553383588790894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5954751968383789, + "epoch": 7.42, + "learning_rate": 1.4356156663848972e-05, + "loss": 0.6238, + "step": 8773, + "task_loss": 0.45196864008903503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6677011251449585, + "epoch": 7.42, + "learning_rate": 1.435146050530666e-05, + "loss": 0.654, + "step": 8774, + "task_loss": 0.4764656722545624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7266092300415039, + "epoch": 7.42, + "learning_rate": 1.4346764346764346e-05, + "loss": 0.826, + "step": 8775, + "task_loss": 0.4710925817489624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4168340563774109, + "epoch": 7.42, + "learning_rate": 1.4342068188222036e-05, + "loss": 0.5403, + "step": 8776, + "task_loss": 0.8976253271102905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3759346008300781, + "epoch": 7.42, + "learning_rate": 1.4337372029679722e-05, + "loss": 0.6317, + "step": 8777, + "task_loss": 0.8430954217910767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9215548634529114, + "epoch": 7.42, + "learning_rate": 1.433267587113741e-05, + "loss": 0.8804, + "step": 8778, + "task_loss": 0.9477734565734863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5528453588485718, + "epoch": 7.42, + "learning_rate": 1.4327979712595097e-05, + "loss": 0.5081, + "step": 8779, + "task_loss": 0.2547319531440735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7810794115066528, + "epoch": 7.42, + "learning_rate": 1.4323283554052786e-05, + "loss": 0.6952, + "step": 8780, + "task_loss": 0.3269648551940918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7016751170158386, + "epoch": 7.42, + "learning_rate": 1.4318587395510474e-05, + "loss": 0.6036, + "step": 8781, + "task_loss": 0.3167632818222046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7900704741477966, + "epoch": 7.42, + "learning_rate": 1.431389123696816e-05, + "loss": 0.6307, + "step": 8782, + "task_loss": 0.7553130388259888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2663400173187256, + "epoch": 7.42, + "learning_rate": 1.4309195078425849e-05, + "loss": 0.6085, + "step": 8783, + "task_loss": 0.23223000764846802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8492657542228699, + "epoch": 7.42, + "learning_rate": 1.4304498919883535e-05, + "loss": 0.6162, + "step": 8784, + "task_loss": 0.6101099252700806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7760933041572571, + "epoch": 7.43, + "learning_rate": 1.4299802761341225e-05, + "loss": 0.7803, + "step": 8785, + "task_loss": 0.7223026156425476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4555625915527344, + "epoch": 7.43, + "learning_rate": 1.4295106602798911e-05, + "loss": 0.5628, + "step": 8786, + "task_loss": 0.5610942244529724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8137398958206177, + "epoch": 7.43, + "learning_rate": 1.4290410444256599e-05, + "loss": 0.6833, + "step": 8787, + "task_loss": 0.868008553981781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.727192223072052, + "epoch": 7.43, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.8524, + "step": 8788, + "task_loss": 0.36701327562332153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5639445185661316, + "epoch": 7.43, + "learning_rate": 1.4281018127171975e-05, + "loss": 0.5052, + "step": 8789, + "task_loss": 0.680091917514801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38779574632644653, + "epoch": 7.43, + "learning_rate": 1.427632196862966e-05, + "loss": 0.5415, + "step": 8790, + "task_loss": 0.3456108570098877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3138313293457031, + "epoch": 7.43, + "learning_rate": 1.427162581008735e-05, + "loss": 0.452, + "step": 8791, + "task_loss": 0.2778777778148651 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6917424201965332, + "epoch": 7.43, + "learning_rate": 1.4266929651545036e-05, + "loss": 0.6132, + "step": 8792, + "task_loss": 0.8920177817344666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49301934242248535, + "epoch": 7.43, + "learning_rate": 1.4262233493002724e-05, + "loss": 0.554, + "step": 8793, + "task_loss": 0.32925766706466675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4216076135635376, + "epoch": 7.43, + "learning_rate": 1.425753733446041e-05, + "loss": 0.6, + "step": 8794, + "task_loss": 0.7472627758979797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5538419485092163, + "epoch": 7.43, + "learning_rate": 1.42528411759181e-05, + "loss": 0.6034, + "step": 8795, + "task_loss": 0.8886017799377441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5164063572883606, + "epoch": 7.44, + "learning_rate": 1.4248145017375788e-05, + "loss": 0.7603, + "step": 8796, + "task_loss": 0.19771885871887207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9094198942184448, + "epoch": 7.44, + "learning_rate": 1.4243448858833474e-05, + "loss": 0.6542, + "step": 8797, + "task_loss": 0.3641730546951294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47626736760139465, + "epoch": 7.44, + "learning_rate": 1.4238752700291164e-05, + "loss": 0.7843, + "step": 8798, + "task_loss": 0.13868051767349243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5356508493423462, + "epoch": 7.44, + "learning_rate": 1.423405654174885e-05, + "loss": 0.6581, + "step": 8799, + "task_loss": 0.8187415599822998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44745469093322754, + "epoch": 7.44, + "learning_rate": 1.4229360383206538e-05, + "loss": 0.4254, + "step": 8800, + "task_loss": 0.40487509965896606 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0848885774612427, + "epoch": 7.44, + "learning_rate": 1.4224664224664225e-05, + "loss": 1.003, + "step": 8801, + "task_loss": 1.5169591903686523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6064279079437256, + "epoch": 7.44, + "learning_rate": 1.4219968066121915e-05, + "loss": 0.8561, + "step": 8802, + "task_loss": 1.417007565498352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48177289962768555, + "epoch": 7.44, + "learning_rate": 1.42152719075796e-05, + "loss": 0.47, + "step": 8803, + "task_loss": 0.5477072596549988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9486595988273621, + "epoch": 7.44, + "learning_rate": 1.4210575749037289e-05, + "loss": 0.6318, + "step": 8804, + "task_loss": 0.5397034287452698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3896545469760895, + "epoch": 7.44, + "learning_rate": 1.4205879590494975e-05, + "loss": 0.4676, + "step": 8805, + "task_loss": 0.38796597719192505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40605467557907104, + "epoch": 7.44, + "learning_rate": 1.4201183431952663e-05, + "loss": 0.6333, + "step": 8806, + "task_loss": 0.7369310259819031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9974750280380249, + "epoch": 7.44, + "learning_rate": 1.419648727341035e-05, + "loss": 0.6219, + "step": 8807, + "task_loss": 0.6610552072525024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5816104412078857, + "epoch": 7.45, + "learning_rate": 1.419179111486804e-05, + "loss": 0.8006, + "step": 8808, + "task_loss": 1.196359395980835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7082867622375488, + "epoch": 7.45, + "learning_rate": 1.4187094956325726e-05, + "loss": 0.5379, + "step": 8809, + "task_loss": 0.7322938442230225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41693195700645447, + "epoch": 7.45, + "learning_rate": 1.4182398797783414e-05, + "loss": 0.7126, + "step": 8810, + "task_loss": 0.18205788731575012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.518075704574585, + "epoch": 7.45, + "learning_rate": 1.4177702639241103e-05, + "loss": 0.5328, + "step": 8811, + "task_loss": 0.4997875988483429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7040442228317261, + "epoch": 7.45, + "learning_rate": 1.4173006480698788e-05, + "loss": 0.6741, + "step": 8812, + "task_loss": 1.1575943231582642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.935731053352356, + "epoch": 7.45, + "learning_rate": 1.4168310322156478e-05, + "loss": 0.586, + "step": 8813, + "task_loss": 1.2900288105010986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9262793660163879, + "epoch": 7.45, + "learning_rate": 1.4163614163614164e-05, + "loss": 0.7475, + "step": 8814, + "task_loss": 1.557800531387329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4070414900779724, + "epoch": 7.45, + "learning_rate": 1.4158918005071852e-05, + "loss": 0.6104, + "step": 8815, + "task_loss": 0.6680669188499451 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4281339943408966, + "epoch": 7.45, + "learning_rate": 1.4154221846529539e-05, + "loss": 0.5274, + "step": 8816, + "task_loss": 0.9679272174835205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5258476734161377, + "epoch": 7.45, + "learning_rate": 1.4149525687987228e-05, + "loss": 0.5405, + "step": 8817, + "task_loss": 1.1053797006607056 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7854675650596619, + "epoch": 7.45, + "learning_rate": 1.4144829529444915e-05, + "loss": 0.7146, + "step": 8818, + "task_loss": 1.2905930280685425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.63397216796875, + "epoch": 7.45, + "learning_rate": 1.4140133370902603e-05, + "loss": 0.7881, + "step": 8819, + "task_loss": 0.6018019318580627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8031166791915894, + "epoch": 7.46, + "learning_rate": 1.4135437212360289e-05, + "loss": 0.6013, + "step": 8820, + "task_loss": 0.5283890962600708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8007916212081909, + "epoch": 7.46, + "learning_rate": 1.4130741053817979e-05, + "loss": 0.6384, + "step": 8821, + "task_loss": 0.831751823425293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0044329166412354, + "epoch": 7.46, + "learning_rate": 1.4126044895275663e-05, + "loss": 0.7521, + "step": 8822, + "task_loss": 0.4560522139072418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49193131923675537, + "epoch": 7.46, + "learning_rate": 1.4121348736733353e-05, + "loss": 0.5287, + "step": 8823, + "task_loss": 0.3974684178829193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6580852270126343, + "epoch": 7.46, + "learning_rate": 1.411665257819104e-05, + "loss": 0.7038, + "step": 8824, + "task_loss": 1.2195870876312256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.363609254360199, + "epoch": 7.46, + "learning_rate": 1.4111956419648727e-05, + "loss": 0.561, + "step": 8825, + "task_loss": 0.9209455847740173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49771973490715027, + "epoch": 7.46, + "learning_rate": 1.4107260261106417e-05, + "loss": 0.5071, + "step": 8826, + "task_loss": 0.5364199876785278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9025495648384094, + "epoch": 7.46, + "learning_rate": 1.4102564102564104e-05, + "loss": 0.6297, + "step": 8827, + "task_loss": 1.0529654026031494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4821113646030426, + "epoch": 7.46, + "learning_rate": 1.4097867944021792e-05, + "loss": 0.6007, + "step": 8828, + "task_loss": 0.16412353515625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3451638221740723, + "epoch": 7.46, + "learning_rate": 1.4093171785479478e-05, + "loss": 0.904, + "step": 8829, + "task_loss": 0.6956733465194702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9514122605323792, + "epoch": 7.46, + "learning_rate": 1.4088475626937168e-05, + "loss": 0.6437, + "step": 8830, + "task_loss": 1.6201086044311523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5856503844261169, + "epoch": 7.46, + "learning_rate": 1.4083779468394854e-05, + "loss": 0.6313, + "step": 8831, + "task_loss": 0.4601775109767914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.21267268061637878, + "epoch": 7.47, + "learning_rate": 1.4079083309852542e-05, + "loss": 0.7022, + "step": 8832, + "task_loss": 0.14750169217586517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0473490953445435, + "epoch": 7.47, + "learning_rate": 1.4074387151310228e-05, + "loss": 0.624, + "step": 8833, + "task_loss": 1.0580224990844727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9063878059387207, + "epoch": 7.47, + "learning_rate": 1.4069690992767916e-05, + "loss": 0.7555, + "step": 8834, + "task_loss": 1.02269446849823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33772343397140503, + "epoch": 7.47, + "learning_rate": 1.4064994834225603e-05, + "loss": 0.4743, + "step": 8835, + "task_loss": 0.46164241433143616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4827035367488861, + "epoch": 7.47, + "learning_rate": 1.4060298675683292e-05, + "loss": 0.6154, + "step": 8836, + "task_loss": 0.7680602669715881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9013213515281677, + "epoch": 7.47, + "learning_rate": 1.4055602517140979e-05, + "loss": 0.7917, + "step": 8837, + "task_loss": 0.5436569452285767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8341455459594727, + "epoch": 7.47, + "learning_rate": 1.4050906358598667e-05, + "loss": 0.6919, + "step": 8838, + "task_loss": 0.6588075757026672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.762584924697876, + "epoch": 7.47, + "learning_rate": 1.4046210200056353e-05, + "loss": 0.5851, + "step": 8839, + "task_loss": 0.8509184122085571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5847916603088379, + "epoch": 7.47, + "learning_rate": 1.4041514041514043e-05, + "loss": 0.4197, + "step": 8840, + "task_loss": 0.3632822036743164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6044394969940186, + "epoch": 7.47, + "learning_rate": 1.4036817882971731e-05, + "loss": 0.6412, + "step": 8841, + "task_loss": 1.1279752254486084 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3632338345050812, + "epoch": 7.47, + "learning_rate": 1.4032121724429417e-05, + "loss": 0.5429, + "step": 8842, + "task_loss": 1.2087377309799194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9052183032035828, + "epoch": 7.47, + "learning_rate": 1.4027425565887107e-05, + "loss": 0.9335, + "step": 8843, + "task_loss": 1.5793147087097168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7720504999160767, + "epoch": 7.48, + "learning_rate": 1.4022729407344792e-05, + "loss": 0.7824, + "step": 8844, + "task_loss": 1.397289514541626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6841272115707397, + "epoch": 7.48, + "learning_rate": 1.4018033248802481e-05, + "loss": 0.7192, + "step": 8845, + "task_loss": 0.8133702278137207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36515647172927856, + "epoch": 7.48, + "learning_rate": 1.4013337090260168e-05, + "loss": 0.7363, + "step": 8846, + "task_loss": 0.7815085649490356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7764300107955933, + "epoch": 7.48, + "learning_rate": 1.4008640931717856e-05, + "loss": 0.7638, + "step": 8847, + "task_loss": 1.2551336288452148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6647482514381409, + "epoch": 7.48, + "learning_rate": 1.4003944773175542e-05, + "loss": 0.9178, + "step": 8848, + "task_loss": 1.0449137687683105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5100473761558533, + "epoch": 7.48, + "learning_rate": 1.3999248614633232e-05, + "loss": 0.7298, + "step": 8849, + "task_loss": 0.48996424674987793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6519216299057007, + "epoch": 7.48, + "learning_rate": 1.3994552456090918e-05, + "loss": 0.5964, + "step": 8850, + "task_loss": 0.514051616191864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6779094934463501, + "epoch": 7.48, + "learning_rate": 1.3989856297548606e-05, + "loss": 0.7047, + "step": 8851, + "task_loss": 0.98404860496521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4576091170310974, + "epoch": 7.48, + "learning_rate": 1.3985160139006292e-05, + "loss": 0.5453, + "step": 8852, + "task_loss": 0.7895155549049377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8270617723464966, + "epoch": 7.48, + "learning_rate": 1.3980463980463982e-05, + "loss": 0.6426, + "step": 8853, + "task_loss": 0.7324225902557373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.28825750946998596, + "epoch": 7.48, + "learning_rate": 1.3975767821921667e-05, + "loss": 0.5616, + "step": 8854, + "task_loss": 0.4130653440952301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5886930227279663, + "epoch": 7.48, + "learning_rate": 1.3971071663379357e-05, + "loss": 0.6386, + "step": 8855, + "task_loss": 0.7830871939659119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1322964429855347, + "epoch": 7.49, + "learning_rate": 1.3966375504837043e-05, + "loss": 0.6003, + "step": 8856, + "task_loss": 1.1609469652175903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41545674204826355, + "epoch": 7.49, + "learning_rate": 1.3961679346294731e-05, + "loss": 0.4415, + "step": 8857, + "task_loss": 0.5572225451469421 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34669339656829834, + "epoch": 7.49, + "learning_rate": 1.395698318775242e-05, + "loss": 0.4847, + "step": 8858, + "task_loss": 0.36142638325691223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5019984245300293, + "epoch": 7.49, + "learning_rate": 1.3952287029210107e-05, + "loss": 0.6625, + "step": 8859, + "task_loss": 0.243475079536438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39736732840538025, + "epoch": 7.49, + "learning_rate": 1.3947590870667795e-05, + "loss": 0.6771, + "step": 8860, + "task_loss": 0.7308594584465027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.51688551902771, + "epoch": 7.49, + "learning_rate": 1.3942894712125481e-05, + "loss": 0.4559, + "step": 8861, + "task_loss": 0.6861639022827148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6876169443130493, + "epoch": 7.49, + "learning_rate": 1.3938198553583171e-05, + "loss": 0.6824, + "step": 8862, + "task_loss": 0.7852447628974915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45001864433288574, + "epoch": 7.49, + "learning_rate": 1.3933502395040857e-05, + "loss": 0.6047, + "step": 8863, + "task_loss": 0.8527868986129761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6730351448059082, + "epoch": 7.49, + "learning_rate": 1.3928806236498546e-05, + "loss": 0.5682, + "step": 8864, + "task_loss": 0.2160242199897766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36679577827453613, + "epoch": 7.49, + "learning_rate": 1.3924110077956232e-05, + "loss": 0.5995, + "step": 8865, + "task_loss": 0.11887159198522568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5325392484664917, + "epoch": 7.49, + "learning_rate": 1.391941391941392e-05, + "loss": 0.6127, + "step": 8866, + "task_loss": 0.6124284863471985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6941558122634888, + "epoch": 7.5, + "learning_rate": 1.3914717760871606e-05, + "loss": 0.6046, + "step": 8867, + "task_loss": 0.37787818908691406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7647233009338379, + "epoch": 7.5, + "learning_rate": 1.3910021602329296e-05, + "loss": 0.606, + "step": 8868, + "task_loss": 1.3090510368347168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4305768311023712, + "epoch": 7.5, + "learning_rate": 1.3905325443786982e-05, + "loss": 0.6691, + "step": 8869, + "task_loss": 0.17104634642601013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.607215404510498, + "epoch": 7.5, + "learning_rate": 1.390062928524467e-05, + "loss": 0.6101, + "step": 8870, + "task_loss": 0.8759523034095764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3336174488067627, + "epoch": 7.5, + "learning_rate": 1.3895933126702357e-05, + "loss": 0.4743, + "step": 8871, + "task_loss": 0.15742571651935577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4073694944381714, + "epoch": 7.5, + "learning_rate": 1.3891236968160046e-05, + "loss": 0.4578, + "step": 8872, + "task_loss": 0.49473658204078674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44035202264785767, + "epoch": 7.5, + "learning_rate": 1.3886540809617734e-05, + "loss": 0.6142, + "step": 8873, + "task_loss": 0.8415103554725647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7873488068580627, + "epoch": 7.5, + "learning_rate": 1.388184465107542e-05, + "loss": 0.7357, + "step": 8874, + "task_loss": 1.210612177848816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41283154487609863, + "epoch": 7.5, + "learning_rate": 1.387714849253311e-05, + "loss": 0.5424, + "step": 8875, + "task_loss": 0.20983490347862244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4437851905822754, + "epoch": 7.5, + "learning_rate": 1.3872452333990795e-05, + "loss": 0.6135, + "step": 8876, + "task_loss": 0.31561535596847534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7585561871528625, + "epoch": 7.5, + "learning_rate": 1.3867756175448485e-05, + "loss": 0.7301, + "step": 8877, + "task_loss": 0.78935706615448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8111639022827148, + "epoch": 7.5, + "learning_rate": 1.3863060016906171e-05, + "loss": 0.6319, + "step": 8878, + "task_loss": 0.288131445646286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6545495986938477, + "epoch": 7.51, + "learning_rate": 1.385836385836386e-05, + "loss": 0.6141, + "step": 8879, + "task_loss": 0.19642230868339539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3311038613319397, + "epoch": 7.51, + "learning_rate": 1.3853667699821546e-05, + "loss": 0.5377, + "step": 8880, + "task_loss": 0.7662826776504517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6641418933868408, + "epoch": 7.51, + "learning_rate": 1.3848971541279235e-05, + "loss": 0.4628, + "step": 8881, + "task_loss": 0.8792110681533813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6287149786949158, + "epoch": 7.51, + "learning_rate": 1.3844275382736922e-05, + "loss": 1.0357, + "step": 8882, + "task_loss": 0.4099615216255188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8372287750244141, + "epoch": 7.51, + "learning_rate": 1.383957922419461e-05, + "loss": 0.5975, + "step": 8883, + "task_loss": 1.183732509613037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9073964357376099, + "epoch": 7.51, + "learning_rate": 1.3834883065652296e-05, + "loss": 0.6814, + "step": 8884, + "task_loss": 1.5141808986663818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5084950923919678, + "epoch": 7.51, + "learning_rate": 1.3830186907109984e-05, + "loss": 0.4327, + "step": 8885, + "task_loss": 0.7032251358032227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33746883273124695, + "epoch": 7.51, + "learning_rate": 1.382549074856767e-05, + "loss": 0.7159, + "step": 8886, + "task_loss": 1.1070866584777832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7314506769180298, + "epoch": 7.51, + "learning_rate": 1.382079459002536e-05, + "loss": 0.7526, + "step": 8887, + "task_loss": 0.32172614336013794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6887412071228027, + "epoch": 7.51, + "learning_rate": 1.3816098431483048e-05, + "loss": 0.5565, + "step": 8888, + "task_loss": 0.9023212790489197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26296737790107727, + "epoch": 7.51, + "learning_rate": 1.3811402272940734e-05, + "loss": 0.5218, + "step": 8889, + "task_loss": 0.22079001367092133 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7881039381027222, + "epoch": 7.51, + "learning_rate": 1.3806706114398424e-05, + "loss": 0.5847, + "step": 8890, + "task_loss": 0.5720228552818298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3764512240886688, + "epoch": 7.52, + "learning_rate": 1.380200995585611e-05, + "loss": 0.523, + "step": 8891, + "task_loss": 0.30603504180908203 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5162084102630615, + "epoch": 7.52, + "learning_rate": 1.3797313797313799e-05, + "loss": 0.6025, + "step": 8892, + "task_loss": 0.29257574677467346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6364763975143433, + "epoch": 7.52, + "learning_rate": 1.3792617638771485e-05, + "loss": 0.6658, + "step": 8893, + "task_loss": 0.8498486876487732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6339189410209656, + "epoch": 7.52, + "learning_rate": 1.3787921480229175e-05, + "loss": 0.5087, + "step": 8894, + "task_loss": 0.8223717212677002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6795960068702698, + "epoch": 7.52, + "learning_rate": 1.378322532168686e-05, + "loss": 0.9069, + "step": 8895, + "task_loss": 0.7811558842658997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9231357574462891, + "epoch": 7.52, + "learning_rate": 1.3778529163144549e-05, + "loss": 0.674, + "step": 8896, + "task_loss": 0.8680524826049805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7075031995773315, + "epoch": 7.52, + "learning_rate": 1.3773833004602235e-05, + "loss": 0.5444, + "step": 8897, + "task_loss": 0.6901678442955017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5338830351829529, + "epoch": 7.52, + "learning_rate": 1.3769136846059923e-05, + "loss": 0.4502, + "step": 8898, + "task_loss": 1.1126830577850342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.542309045791626, + "epoch": 7.52, + "learning_rate": 1.376444068751761e-05, + "loss": 0.6814, + "step": 8899, + "task_loss": 1.0683743953704834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43895402550697327, + "epoch": 7.52, + "learning_rate": 1.37597445289753e-05, + "loss": 0.4483, + "step": 8900, + "task_loss": 0.9535865783691406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7284560203552246, + "epoch": 7.52, + "learning_rate": 1.3755048370432986e-05, + "loss": 0.5832, + "step": 8901, + "task_loss": 0.5317504405975342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4973980486392975, + "epoch": 7.52, + "learning_rate": 1.3750352211890674e-05, + "loss": 0.4984, + "step": 8902, + "task_loss": 0.7434775829315186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5061662197113037, + "epoch": 7.53, + "learning_rate": 1.3745656053348364e-05, + "loss": 0.5925, + "step": 8903, + "task_loss": 0.7438743114471436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25885939598083496, + "epoch": 7.53, + "learning_rate": 1.374095989480605e-05, + "loss": 0.5416, + "step": 8904, + "task_loss": 0.4865226149559021 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4541357457637787, + "epoch": 7.53, + "learning_rate": 1.3736263736263738e-05, + "loss": 0.669, + "step": 8905, + "task_loss": 0.3624517023563385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4954375624656677, + "epoch": 7.53, + "learning_rate": 1.3731567577721424e-05, + "loss": 0.6767, + "step": 8906, + "task_loss": 0.35997769236564636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5830038189888, + "epoch": 7.53, + "learning_rate": 1.3726871419179112e-05, + "loss": 0.6085, + "step": 8907, + "task_loss": 0.9088413119316101 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31482070684432983, + "epoch": 7.53, + "learning_rate": 1.3722175260636799e-05, + "loss": 0.6398, + "step": 8908, + "task_loss": 0.22740982472896576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26539477705955505, + "epoch": 7.53, + "learning_rate": 1.3717479102094488e-05, + "loss": 0.5436, + "step": 8909, + "task_loss": 0.3458150327205658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3648606538772583, + "epoch": 7.53, + "learning_rate": 1.3712782943552175e-05, + "loss": 0.5971, + "step": 8910, + "task_loss": 0.35045376420021057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5343583226203918, + "epoch": 7.53, + "learning_rate": 1.3708086785009863e-05, + "loss": 0.4939, + "step": 8911, + "task_loss": 0.8208145499229431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9925504326820374, + "epoch": 7.53, + "learning_rate": 1.3703390626467549e-05, + "loss": 0.6825, + "step": 8912, + "task_loss": 0.9285597801208496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40402334928512573, + "epoch": 7.53, + "learning_rate": 1.3698694467925239e-05, + "loss": 0.5284, + "step": 8913, + "task_loss": 1.4407533407211304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5207595825195312, + "epoch": 7.53, + "learning_rate": 1.3693998309382925e-05, + "loss": 0.7412, + "step": 8914, + "task_loss": 0.3618236184120178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4540119171142578, + "epoch": 7.54, + "learning_rate": 1.3689302150840613e-05, + "loss": 0.6699, + "step": 8915, + "task_loss": 1.0704293251037598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5440312623977661, + "epoch": 7.54, + "learning_rate": 1.36846059922983e-05, + "loss": 0.4314, + "step": 8916, + "task_loss": 0.6740189790725708 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7333455085754395, + "epoch": 7.54, + "learning_rate": 1.3679909833755988e-05, + "loss": 0.6413, + "step": 8917, + "task_loss": 0.9622343182563782 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6980935335159302, + "epoch": 7.54, + "learning_rate": 1.3675213675213677e-05, + "loss": 0.7288, + "step": 8918, + "task_loss": 0.783235490322113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5580082535743713, + "epoch": 7.54, + "learning_rate": 1.3670517516671364e-05, + "loss": 0.7308, + "step": 8919, + "task_loss": 0.7305170297622681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8420637249946594, + "epoch": 7.54, + "learning_rate": 1.3665821358129052e-05, + "loss": 0.7902, + "step": 8920, + "task_loss": 1.3712555170059204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6346033811569214, + "epoch": 7.54, + "learning_rate": 1.3661125199586738e-05, + "loss": 0.9368, + "step": 8921, + "task_loss": 1.1724592447280884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5805145502090454, + "epoch": 7.54, + "learning_rate": 1.3656429041044428e-05, + "loss": 0.4025, + "step": 8922, + "task_loss": 0.9596258401870728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7997813820838928, + "epoch": 7.54, + "learning_rate": 1.3651732882502114e-05, + "loss": 0.6564, + "step": 8923, + "task_loss": 0.6341410279273987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4153688848018646, + "epoch": 7.54, + "learning_rate": 1.3647036723959802e-05, + "loss": 0.5531, + "step": 8924, + "task_loss": 0.4851565659046173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8263653516769409, + "epoch": 7.54, + "learning_rate": 1.3642340565417488e-05, + "loss": 0.6389, + "step": 8925, + "task_loss": 1.1839333772659302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6100901365280151, + "epoch": 7.54, + "learning_rate": 1.3637644406875178e-05, + "loss": 0.5973, + "step": 8926, + "task_loss": 0.760794997215271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7172326445579529, + "epoch": 7.55, + "learning_rate": 1.3632948248332863e-05, + "loss": 0.7104, + "step": 8927, + "task_loss": 0.5268847942352295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6305948495864868, + "epoch": 7.55, + "learning_rate": 1.3628252089790553e-05, + "loss": 0.6811, + "step": 8928, + "task_loss": 0.8348854780197144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6487269401550293, + "epoch": 7.55, + "learning_rate": 1.3623555931248239e-05, + "loss": 0.6046, + "step": 8929, + "task_loss": 0.664027214050293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7526897192001343, + "epoch": 7.55, + "learning_rate": 1.3618859772705927e-05, + "loss": 0.7821, + "step": 8930, + "task_loss": 1.2933772802352905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42170771956443787, + "epoch": 7.55, + "learning_rate": 1.3614163614163613e-05, + "loss": 0.5188, + "step": 8931, + "task_loss": 0.4956776797771454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7291697263717651, + "epoch": 7.55, + "learning_rate": 1.3609467455621303e-05, + "loss": 0.707, + "step": 8932, + "task_loss": 0.6128455996513367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5117077827453613, + "epoch": 7.55, + "learning_rate": 1.360477129707899e-05, + "loss": 0.5902, + "step": 8933, + "task_loss": 0.7781287431716919 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6265698671340942, + "epoch": 7.55, + "learning_rate": 1.3600075138536677e-05, + "loss": 0.5352, + "step": 8934, + "task_loss": 0.6233731508255005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7455462217330933, + "epoch": 7.55, + "learning_rate": 1.3595378979994367e-05, + "loss": 0.5942, + "step": 8935, + "task_loss": 0.9626688361167908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5240319967269897, + "epoch": 7.55, + "learning_rate": 1.3590682821452053e-05, + "loss": 0.6312, + "step": 8936, + "task_loss": 0.30184900760650635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6667303442955017, + "epoch": 7.55, + "learning_rate": 1.3585986662909741e-05, + "loss": 0.5168, + "step": 8937, + "task_loss": 0.342134028673172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8145533800125122, + "epoch": 7.56, + "learning_rate": 1.3581290504367428e-05, + "loss": 0.6233, + "step": 8938, + "task_loss": 0.7255985736846924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5288999080657959, + "epoch": 7.56, + "learning_rate": 1.3576594345825116e-05, + "loss": 0.7124, + "step": 8939, + "task_loss": 0.4955522119998932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49799031019210815, + "epoch": 7.56, + "learning_rate": 1.3571898187282802e-05, + "loss": 0.6238, + "step": 8940, + "task_loss": 0.6523423194885254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4156922399997711, + "epoch": 7.56, + "learning_rate": 1.3567202028740492e-05, + "loss": 0.4967, + "step": 8941, + "task_loss": 0.5171871185302734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4986758530139923, + "epoch": 7.56, + "learning_rate": 1.3562505870198178e-05, + "loss": 0.5186, + "step": 8942, + "task_loss": 0.8211464285850525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.584071159362793, + "epoch": 7.56, + "learning_rate": 1.3557809711655866e-05, + "loss": 0.5939, + "step": 8943, + "task_loss": 0.7096570730209351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7129727602005005, + "epoch": 7.56, + "learning_rate": 1.3553113553113553e-05, + "loss": 0.505, + "step": 8944, + "task_loss": 1.032939076423645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5510331988334656, + "epoch": 7.56, + "learning_rate": 1.3548417394571242e-05, + "loss": 0.6564, + "step": 8945, + "task_loss": 0.568354606628418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0309984683990479, + "epoch": 7.56, + "learning_rate": 1.3543721236028927e-05, + "loss": 0.6858, + "step": 8946, + "task_loss": 1.9083861112594604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5534846782684326, + "epoch": 7.56, + "learning_rate": 1.3539025077486617e-05, + "loss": 0.7029, + "step": 8947, + "task_loss": 0.3806469440460205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5350360870361328, + "epoch": 7.56, + "learning_rate": 1.3534328918944303e-05, + "loss": 0.6174, + "step": 8948, + "task_loss": 0.5583010911941528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47263404726982117, + "epoch": 7.56, + "learning_rate": 1.3529632760401991e-05, + "loss": 0.5574, + "step": 8949, + "task_loss": 0.19095216691493988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5050381422042847, + "epoch": 7.57, + "learning_rate": 1.352493660185968e-05, + "loss": 0.6579, + "step": 8950, + "task_loss": 1.0900251865386963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7350224256515503, + "epoch": 7.57, + "learning_rate": 1.3520240443317367e-05, + "loss": 0.7378, + "step": 8951, + "task_loss": 1.9819756746292114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35176029801368713, + "epoch": 7.57, + "learning_rate": 1.3515544284775055e-05, + "loss": 0.5924, + "step": 8952, + "task_loss": 0.9507119059562683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9072763919830322, + "epoch": 7.57, + "learning_rate": 1.3510848126232742e-05, + "loss": 0.5656, + "step": 8953, + "task_loss": 0.7344108819961548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5861798524856567, + "epoch": 7.57, + "learning_rate": 1.3506151967690431e-05, + "loss": 0.5776, + "step": 8954, + "task_loss": 1.2045481204986572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4218946397304535, + "epoch": 7.57, + "learning_rate": 1.3501455809148118e-05, + "loss": 0.5492, + "step": 8955, + "task_loss": 1.098868489265442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6271092891693115, + "epoch": 7.57, + "learning_rate": 1.3496759650605806e-05, + "loss": 0.5885, + "step": 8956, + "task_loss": 0.7869184613227844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9844666719436646, + "epoch": 7.57, + "learning_rate": 1.3492063492063492e-05, + "loss": 0.6404, + "step": 8957, + "task_loss": 0.669703483581543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.20546510815620422, + "epoch": 7.57, + "learning_rate": 1.3487367333521182e-05, + "loss": 0.5522, + "step": 8958, + "task_loss": 0.43966707587242126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46099865436553955, + "epoch": 7.57, + "learning_rate": 1.3482671174978866e-05, + "loss": 0.5629, + "step": 8959, + "task_loss": 0.4389518201351166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7387072443962097, + "epoch": 7.57, + "learning_rate": 1.3477975016436556e-05, + "loss": 0.6891, + "step": 8960, + "task_loss": 1.1003303527832031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7470431327819824, + "epoch": 7.57, + "learning_rate": 1.3473278857894242e-05, + "loss": 0.7838, + "step": 8961, + "task_loss": 0.5220003724098206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25619348883628845, + "epoch": 7.58, + "learning_rate": 1.346858269935193e-05, + "loss": 0.5534, + "step": 8962, + "task_loss": 0.4513145387172699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9220412373542786, + "epoch": 7.58, + "learning_rate": 1.3463886540809617e-05, + "loss": 0.6621, + "step": 8963, + "task_loss": 1.2315433025360107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6066278219223022, + "epoch": 7.58, + "learning_rate": 1.3459190382267307e-05, + "loss": 0.5635, + "step": 8964, + "task_loss": 0.4779108166694641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7027988433837891, + "epoch": 7.58, + "learning_rate": 1.3454494223724995e-05, + "loss": 0.5846, + "step": 8965, + "task_loss": 0.4471530616283417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5344998240470886, + "epoch": 7.58, + "learning_rate": 1.3449798065182681e-05, + "loss": 0.4655, + "step": 8966, + "task_loss": 0.6914742588996887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6566864252090454, + "epoch": 7.58, + "learning_rate": 1.344510190664037e-05, + "loss": 0.5817, + "step": 8967, + "task_loss": 0.768980085849762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.417716383934021, + "epoch": 7.58, + "learning_rate": 1.3440405748098055e-05, + "loss": 0.8569, + "step": 8968, + "task_loss": 1.143964171409607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5764356255531311, + "epoch": 7.58, + "learning_rate": 1.3435709589555745e-05, + "loss": 0.7695, + "step": 8969, + "task_loss": 0.27632227540016174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4697914719581604, + "epoch": 7.58, + "learning_rate": 1.3431013431013431e-05, + "loss": 0.6726, + "step": 8970, + "task_loss": 0.6204628944396973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2383044958114624, + "epoch": 7.58, + "learning_rate": 1.342631727247112e-05, + "loss": 0.5566, + "step": 8971, + "task_loss": 0.012404450215399265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4473098814487457, + "epoch": 7.58, + "learning_rate": 1.3421621113928806e-05, + "loss": 0.5913, + "step": 8972, + "task_loss": 0.6144418120384216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5039156079292297, + "epoch": 7.58, + "learning_rate": 1.3416924955386495e-05, + "loss": 0.5507, + "step": 8973, + "task_loss": 0.5403667092323303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43870508670806885, + "epoch": 7.59, + "learning_rate": 1.3412228796844182e-05, + "loss": 0.5339, + "step": 8974, + "task_loss": 0.17271222174167633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3637676239013672, + "epoch": 7.59, + "learning_rate": 1.340753263830187e-05, + "loss": 0.8828, + "step": 8975, + "task_loss": 0.5955216884613037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7214004397392273, + "epoch": 7.59, + "learning_rate": 1.3402836479759556e-05, + "loss": 0.7546, + "step": 8976, + "task_loss": 0.924312949180603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2916887402534485, + "epoch": 7.59, + "learning_rate": 1.3398140321217246e-05, + "loss": 0.648, + "step": 8977, + "task_loss": 0.41269224882125854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7409454584121704, + "epoch": 7.59, + "learning_rate": 1.339344416267493e-05, + "loss": 0.7126, + "step": 8978, + "task_loss": 1.1775206327438354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47395795583724976, + "epoch": 7.59, + "learning_rate": 1.338874800413262e-05, + "loss": 0.7015, + "step": 8979, + "task_loss": 0.6879599094390869 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6251181364059448, + "epoch": 7.59, + "learning_rate": 1.3384051845590308e-05, + "loss": 0.539, + "step": 8980, + "task_loss": 0.1875106394290924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3682768642902374, + "epoch": 7.59, + "learning_rate": 1.3379355687047995e-05, + "loss": 0.4235, + "step": 8981, + "task_loss": 0.4374229311943054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5918357372283936, + "epoch": 7.59, + "learning_rate": 1.3374659528505684e-05, + "loss": 0.7663, + "step": 8982, + "task_loss": 0.6424500942230225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7029181718826294, + "epoch": 7.59, + "learning_rate": 1.336996336996337e-05, + "loss": 0.4818, + "step": 8983, + "task_loss": 0.41917601227760315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6133747696876526, + "epoch": 7.59, + "learning_rate": 1.3365267211421059e-05, + "loss": 0.6514, + "step": 8984, + "task_loss": 1.036465048789978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4258459806442261, + "epoch": 7.59, + "learning_rate": 1.3360571052878745e-05, + "loss": 0.4757, + "step": 8985, + "task_loss": 0.23279881477355957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6525278687477112, + "epoch": 7.6, + "learning_rate": 1.3355874894336435e-05, + "loss": 0.6549, + "step": 8986, + "task_loss": 0.3322206139564514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4371565878391266, + "epoch": 7.6, + "learning_rate": 1.3351178735794121e-05, + "loss": 0.5731, + "step": 8987, + "task_loss": 0.267018586397171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37684255838394165, + "epoch": 7.6, + "learning_rate": 1.334648257725181e-05, + "loss": 0.6312, + "step": 8988, + "task_loss": 0.6275477409362793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6850274801254272, + "epoch": 7.6, + "learning_rate": 1.3341786418709496e-05, + "loss": 0.6525, + "step": 8989, + "task_loss": 0.6877949833869934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6953070163726807, + "epoch": 7.6, + "learning_rate": 1.3337090260167184e-05, + "loss": 0.6114, + "step": 8990, + "task_loss": 0.6951400637626648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6871379613876343, + "epoch": 7.6, + "learning_rate": 1.333239410162487e-05, + "loss": 0.6864, + "step": 8991, + "task_loss": 1.1365954875946045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.649625837802887, + "epoch": 7.6, + "learning_rate": 1.332769794308256e-05, + "loss": 0.4748, + "step": 8992, + "task_loss": 0.1356354057788849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5677217245101929, + "epoch": 7.6, + "learning_rate": 1.3323001784540246e-05, + "loss": 0.7222, + "step": 8993, + "task_loss": 0.6196795701980591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8794514536857605, + "epoch": 7.6, + "learning_rate": 1.3318305625997934e-05, + "loss": 0.6241, + "step": 8994, + "task_loss": 0.9544141888618469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2140377759933472, + "epoch": 7.6, + "learning_rate": 1.3313609467455624e-05, + "loss": 0.6514, + "step": 8995, + "task_loss": 1.1253108978271484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.22128227353096008, + "epoch": 7.6, + "learning_rate": 1.330891330891331e-05, + "loss": 0.4357, + "step": 8996, + "task_loss": 0.45132794976234436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5807039141654968, + "epoch": 7.6, + "learning_rate": 1.3304217150370998e-05, + "loss": 0.6397, + "step": 8997, + "task_loss": 0.49469056725502014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5622422695159912, + "epoch": 7.61, + "learning_rate": 1.3299520991828684e-05, + "loss": 0.526, + "step": 8998, + "task_loss": 0.6068680882453918 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3791293799877167, + "epoch": 7.61, + "learning_rate": 1.3294824833286374e-05, + "loss": 0.5031, + "step": 8999, + "task_loss": 0.44225290417671204 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2813960313796997, + "epoch": 7.61, + "learning_rate": 1.3290128674744059e-05, + "loss": 0.496, + "step": 9000, + "task_loss": 0.29683274030685425 + }, + { + "epoch": 7.61, + "eval_accuracy": 0.8981386138613862, + "eval_loss": 0.41152384877204895, + "eval_runtime": 225.3134, + "eval_samples_per_second": 112.066, + "eval_steps_per_second": 0.879, + "step": 9000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7307490110397339, + "epoch": 7.61, + "learning_rate": 1.3285432516201749e-05, + "loss": 0.6148, + "step": 9001, + "task_loss": 0.966532289981842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7366096377372742, + "epoch": 7.61, + "learning_rate": 1.3280736357659435e-05, + "loss": 0.5278, + "step": 9002, + "task_loss": 0.47506794333457947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.533441960811615, + "epoch": 7.61, + "learning_rate": 1.3276040199117123e-05, + "loss": 0.7801, + "step": 9003, + "task_loss": 0.9930967688560486 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7346415519714355, + "epoch": 7.61, + "learning_rate": 1.327134404057481e-05, + "loss": 0.7007, + "step": 9004, + "task_loss": 0.7331096529960632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4591485261917114, + "epoch": 7.61, + "learning_rate": 1.3266647882032499e-05, + "loss": 0.578, + "step": 9005, + "task_loss": 0.13209810853004456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7664841413497925, + "epoch": 7.61, + "learning_rate": 1.3261951723490185e-05, + "loss": 0.8004, + "step": 9006, + "task_loss": 0.4957949221134186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3065317869186401, + "epoch": 7.61, + "learning_rate": 1.3257255564947873e-05, + "loss": 1.052, + "step": 9007, + "task_loss": 1.2621599435806274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36970430612564087, + "epoch": 7.61, + "learning_rate": 1.325255940640556e-05, + "loss": 0.4144, + "step": 9008, + "task_loss": 0.5109555125236511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5257435441017151, + "epoch": 7.61, + "learning_rate": 1.324786324786325e-05, + "loss": 0.6924, + "step": 9009, + "task_loss": 0.7217450141906738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47787120938301086, + "epoch": 7.62, + "learning_rate": 1.3243167089320934e-05, + "loss": 0.5521, + "step": 9010, + "task_loss": 0.8045516014099121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44429051876068115, + "epoch": 7.62, + "learning_rate": 1.3238470930778624e-05, + "loss": 0.5788, + "step": 9011, + "task_loss": 1.1146248579025269 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7454016208648682, + "epoch": 7.62, + "learning_rate": 1.3233774772236312e-05, + "loss": 0.7014, + "step": 9012, + "task_loss": 1.4478458166122437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7985577583312988, + "epoch": 7.62, + "learning_rate": 1.3229078613693998e-05, + "loss": 0.8142, + "step": 9013, + "task_loss": 1.3632134199142456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7632204294204712, + "epoch": 7.62, + "learning_rate": 1.3224382455151688e-05, + "loss": 0.6445, + "step": 9014, + "task_loss": 0.5585616230964661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7599582672119141, + "epoch": 7.62, + "learning_rate": 1.3219686296609374e-05, + "loss": 0.6155, + "step": 9015, + "task_loss": 1.4278753995895386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6794182062149048, + "epoch": 7.62, + "learning_rate": 1.3214990138067062e-05, + "loss": 0.5356, + "step": 9016, + "task_loss": 1.3235254287719727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40870916843414307, + "epoch": 7.62, + "learning_rate": 1.3210293979524749e-05, + "loss": 0.5948, + "step": 9017, + "task_loss": 0.27083802223205566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4331333041191101, + "epoch": 7.62, + "learning_rate": 1.3205597820982438e-05, + "loss": 0.6179, + "step": 9018, + "task_loss": 0.5020532608032227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6030506491661072, + "epoch": 7.62, + "learning_rate": 1.3200901662440125e-05, + "loss": 0.5212, + "step": 9019, + "task_loss": 0.9635381698608398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6907272338867188, + "epoch": 7.62, + "learning_rate": 1.3196205503897813e-05, + "loss": 0.5526, + "step": 9020, + "task_loss": 0.7278802990913391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6707903146743774, + "epoch": 7.63, + "learning_rate": 1.3191509345355499e-05, + "loss": 0.5338, + "step": 9021, + "task_loss": 1.2371171712875366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9012331962585449, + "epoch": 7.63, + "learning_rate": 1.3186813186813187e-05, + "loss": 0.6654, + "step": 9022, + "task_loss": 0.46501392126083374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8501212000846863, + "epoch": 7.63, + "learning_rate": 1.3182117028270873e-05, + "loss": 0.7624, + "step": 9023, + "task_loss": 1.4143824577331543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.659072756767273, + "epoch": 7.63, + "learning_rate": 1.3177420869728563e-05, + "loss": 0.6648, + "step": 9024, + "task_loss": 1.3253391981124878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7346950769424438, + "epoch": 7.63, + "learning_rate": 1.317272471118625e-05, + "loss": 0.7094, + "step": 9025, + "task_loss": 0.8217813372612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6128737926483154, + "epoch": 7.63, + "learning_rate": 1.3168028552643938e-05, + "loss": 0.5558, + "step": 9026, + "task_loss": 0.7849315404891968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5093212127685547, + "epoch": 7.63, + "learning_rate": 1.3163332394101627e-05, + "loss": 0.8112, + "step": 9027, + "task_loss": 0.7542285323143005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5931057929992676, + "epoch": 7.63, + "learning_rate": 1.3158636235559314e-05, + "loss": 0.5912, + "step": 9028, + "task_loss": 0.3304416537284851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3930579125881195, + "epoch": 7.63, + "learning_rate": 1.3153940077017002e-05, + "loss": 0.537, + "step": 9029, + "task_loss": 0.412393182516098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5463581681251526, + "epoch": 7.63, + "learning_rate": 1.3149243918474688e-05, + "loss": 0.6033, + "step": 9030, + "task_loss": 1.3231762647628784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3858368396759033, + "epoch": 7.63, + "learning_rate": 1.3144547759932378e-05, + "loss": 0.5303, + "step": 9031, + "task_loss": 0.315184623003006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5829121470451355, + "epoch": 7.63, + "learning_rate": 1.3139851601390062e-05, + "loss": 0.716, + "step": 9032, + "task_loss": 0.6660099625587463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5190823078155518, + "epoch": 7.64, + "learning_rate": 1.3135155442847752e-05, + "loss": 0.7032, + "step": 9033, + "task_loss": 0.49052760004997253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37749361991882324, + "epoch": 7.64, + "learning_rate": 1.3130459284305438e-05, + "loss": 0.5764, + "step": 9034, + "task_loss": 0.10549497604370117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5722342133522034, + "epoch": 7.64, + "learning_rate": 1.3125763125763126e-05, + "loss": 0.6668, + "step": 9035, + "task_loss": 0.12600325047969818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6511390209197998, + "epoch": 7.64, + "learning_rate": 1.3121066967220813e-05, + "loss": 0.5532, + "step": 9036, + "task_loss": 0.061101507395505905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6330512166023254, + "epoch": 7.64, + "learning_rate": 1.3116370808678502e-05, + "loss": 0.5864, + "step": 9037, + "task_loss": 0.4041541516780853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5505348443984985, + "epoch": 7.64, + "learning_rate": 1.3111674650136189e-05, + "loss": 0.534, + "step": 9038, + "task_loss": 1.0301940441131592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40512585639953613, + "epoch": 7.64, + "learning_rate": 1.3106978491593877e-05, + "loss": 0.7018, + "step": 9039, + "task_loss": 0.6876006722450256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34805920720100403, + "epoch": 7.64, + "learning_rate": 1.3102282333051563e-05, + "loss": 0.5758, + "step": 9040, + "task_loss": 0.7382710576057434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6497393846511841, + "epoch": 7.64, + "learning_rate": 1.3097586174509251e-05, + "loss": 0.7256, + "step": 9041, + "task_loss": 0.7922466397285461 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9173288941383362, + "epoch": 7.64, + "learning_rate": 1.3092890015966941e-05, + "loss": 0.844, + "step": 9042, + "task_loss": 0.8858552575111389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4481070339679718, + "epoch": 7.64, + "learning_rate": 1.3088193857424627e-05, + "loss": 0.5436, + "step": 9043, + "task_loss": 0.7812582850456238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5777219533920288, + "epoch": 7.64, + "learning_rate": 1.3083497698882315e-05, + "loss": 0.5799, + "step": 9044, + "task_loss": 1.7439266443252563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9062776565551758, + "epoch": 7.65, + "learning_rate": 1.3078801540340002e-05, + "loss": 0.6698, + "step": 9045, + "task_loss": 1.6864441633224487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5858691930770874, + "epoch": 7.65, + "learning_rate": 1.3074105381797691e-05, + "loss": 0.6701, + "step": 9046, + "task_loss": 1.0712456703186035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5898187160491943, + "epoch": 7.65, + "learning_rate": 1.3069409223255378e-05, + "loss": 0.5469, + "step": 9047, + "task_loss": 0.5957937240600586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6014208793640137, + "epoch": 7.65, + "learning_rate": 1.3064713064713066e-05, + "loss": 0.5643, + "step": 9048, + "task_loss": 0.8048995733261108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9950630068778992, + "epoch": 7.65, + "learning_rate": 1.3060016906170752e-05, + "loss": 0.6559, + "step": 9049, + "task_loss": 0.7754375338554382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4926382303237915, + "epoch": 7.65, + "learning_rate": 1.3055320747628442e-05, + "loss": 0.535, + "step": 9050, + "task_loss": 0.26669442653656006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8775556683540344, + "epoch": 7.65, + "learning_rate": 1.3050624589086126e-05, + "loss": 0.5539, + "step": 9051, + "task_loss": 0.9274231195449829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5096705555915833, + "epoch": 7.65, + "learning_rate": 1.3045928430543816e-05, + "loss": 0.581, + "step": 9052, + "task_loss": 0.21778860688209534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5683337450027466, + "epoch": 7.65, + "learning_rate": 1.3041232272001503e-05, + "loss": 0.5138, + "step": 9053, + "task_loss": 0.7031169533729553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4617474675178528, + "epoch": 7.65, + "learning_rate": 1.303653611345919e-05, + "loss": 0.5944, + "step": 9054, + "task_loss": 0.8714993596076965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42280957102775574, + "epoch": 7.65, + "learning_rate": 1.3031839954916877e-05, + "loss": 0.5865, + "step": 9055, + "task_loss": 0.6723194122314453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5408577919006348, + "epoch": 7.65, + "learning_rate": 1.3027143796374567e-05, + "loss": 0.5282, + "step": 9056, + "task_loss": 0.861878514289856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8703663945198059, + "epoch": 7.66, + "learning_rate": 1.3022447637832255e-05, + "loss": 0.6433, + "step": 9057, + "task_loss": 0.8380740284919739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5695338845252991, + "epoch": 7.66, + "learning_rate": 1.3017751479289941e-05, + "loss": 0.6059, + "step": 9058, + "task_loss": 0.5634458065032959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6485887765884399, + "epoch": 7.66, + "learning_rate": 1.301305532074763e-05, + "loss": 0.5838, + "step": 9059, + "task_loss": 0.19843529164791107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.580459713935852, + "epoch": 7.66, + "learning_rate": 1.3008359162205317e-05, + "loss": 0.5649, + "step": 9060, + "task_loss": 0.4084705710411072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6998453140258789, + "epoch": 7.66, + "learning_rate": 1.3003663003663005e-05, + "loss": 0.5327, + "step": 9061, + "task_loss": 0.5208227634429932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5457407832145691, + "epoch": 7.66, + "learning_rate": 1.2998966845120691e-05, + "loss": 0.6063, + "step": 9062, + "task_loss": 0.47631439566612244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.863166332244873, + "epoch": 7.66, + "learning_rate": 1.299427068657838e-05, + "loss": 0.8222, + "step": 9063, + "task_loss": 1.0228097438812256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8334242105484009, + "epoch": 7.66, + "learning_rate": 1.2989574528036066e-05, + "loss": 0.7328, + "step": 9064, + "task_loss": 1.5601552724838257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9794629812240601, + "epoch": 7.66, + "learning_rate": 1.2984878369493756e-05, + "loss": 0.7113, + "step": 9065, + "task_loss": 0.771183431148529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43146729469299316, + "epoch": 7.66, + "learning_rate": 1.2980182210951442e-05, + "loss": 0.605, + "step": 9066, + "task_loss": 0.6143893003463745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7484097480773926, + "epoch": 7.66, + "learning_rate": 1.297548605240913e-05, + "loss": 0.7027, + "step": 9067, + "task_loss": 1.0117928981781006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0944373607635498, + "epoch": 7.66, + "learning_rate": 1.2970789893866816e-05, + "loss": 0.7084, + "step": 9068, + "task_loss": 2.175537347793579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9182159304618835, + "epoch": 7.67, + "learning_rate": 1.2966093735324506e-05, + "loss": 0.68, + "step": 9069, + "task_loss": 0.17643077671527863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.127352237701416, + "epoch": 7.67, + "learning_rate": 1.2961397576782192e-05, + "loss": 0.7136, + "step": 9070, + "task_loss": 0.9668977856636047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6797225475311279, + "epoch": 7.67, + "learning_rate": 1.295670141823988e-05, + "loss": 0.5786, + "step": 9071, + "task_loss": 1.035664439201355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7414901256561279, + "epoch": 7.67, + "learning_rate": 1.295200525969757e-05, + "loss": 0.5827, + "step": 9072, + "task_loss": 0.47591114044189453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7650946378707886, + "epoch": 7.67, + "learning_rate": 1.2947309101155255e-05, + "loss": 0.7814, + "step": 9073, + "task_loss": 0.4345983862876892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6888265609741211, + "epoch": 7.67, + "learning_rate": 1.2942612942612944e-05, + "loss": 0.695, + "step": 9074, + "task_loss": 0.7842326164245605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5396343469619751, + "epoch": 7.67, + "learning_rate": 1.293791678407063e-05, + "loss": 0.6499, + "step": 9075, + "task_loss": 0.781366765499115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7076486945152283, + "epoch": 7.67, + "learning_rate": 1.2933220625528319e-05, + "loss": 0.7921, + "step": 9076, + "task_loss": 0.8562763929367065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6688251495361328, + "epoch": 7.67, + "learning_rate": 1.2928524466986005e-05, + "loss": 0.8087, + "step": 9077, + "task_loss": 1.161331057548523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6165298223495483, + "epoch": 7.67, + "learning_rate": 1.2923828308443695e-05, + "loss": 0.6788, + "step": 9078, + "task_loss": 0.8113792538642883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9365557432174683, + "epoch": 7.67, + "learning_rate": 1.2919132149901381e-05, + "loss": 0.9046, + "step": 9079, + "task_loss": 1.1113063097000122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5746587514877319, + "epoch": 7.67, + "learning_rate": 1.291443599135907e-05, + "loss": 0.5086, + "step": 9080, + "task_loss": 0.25513365864753723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.722360372543335, + "epoch": 7.68, + "learning_rate": 1.2909739832816756e-05, + "loss": 0.5339, + "step": 9081, + "task_loss": 1.0169212818145752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5195110440254211, + "epoch": 7.68, + "learning_rate": 1.2905043674274445e-05, + "loss": 0.3984, + "step": 9082, + "task_loss": 0.40439295768737793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5156396627426147, + "epoch": 7.68, + "learning_rate": 1.290034751573213e-05, + "loss": 0.6077, + "step": 9083, + "task_loss": 1.0731475353240967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5853645205497742, + "epoch": 7.68, + "learning_rate": 1.289565135718982e-05, + "loss": 0.6383, + "step": 9084, + "task_loss": 1.2003706693649292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4770641624927521, + "epoch": 7.68, + "learning_rate": 1.2890955198647506e-05, + "loss": 0.6225, + "step": 9085, + "task_loss": 0.6634112596511841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5818284153938293, + "epoch": 7.68, + "learning_rate": 1.2886259040105194e-05, + "loss": 0.5852, + "step": 9086, + "task_loss": 0.9344972968101501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8461856245994568, + "epoch": 7.68, + "learning_rate": 1.288156288156288e-05, + "loss": 0.8108, + "step": 9087, + "task_loss": 0.39087262749671936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5091992616653442, + "epoch": 7.68, + "learning_rate": 1.287686672302057e-05, + "loss": 0.6821, + "step": 9088, + "task_loss": 0.7031358480453491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5288878679275513, + "epoch": 7.68, + "learning_rate": 1.2872170564478258e-05, + "loss": 0.5358, + "step": 9089, + "task_loss": 0.4093533456325531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42600560188293457, + "epoch": 7.68, + "learning_rate": 1.2867474405935945e-05, + "loss": 0.5062, + "step": 9090, + "task_loss": 0.409268319606781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6948527097702026, + "epoch": 7.68, + "learning_rate": 1.2862778247393634e-05, + "loss": 0.6799, + "step": 9091, + "task_loss": 1.1037144660949707 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48970529437065125, + "epoch": 7.69, + "learning_rate": 1.285808208885132e-05, + "loss": 0.4723, + "step": 9092, + "task_loss": 0.9574061632156372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7055870294570923, + "epoch": 7.69, + "learning_rate": 1.2853385930309009e-05, + "loss": 0.8125, + "step": 9093, + "task_loss": 1.5533785820007324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6166523694992065, + "epoch": 7.69, + "learning_rate": 1.2848689771766695e-05, + "loss": 0.4769, + "step": 9094, + "task_loss": 0.5199586749076843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7846020460128784, + "epoch": 7.69, + "learning_rate": 1.2843993613224383e-05, + "loss": 0.6677, + "step": 9095, + "task_loss": 0.3481350541114807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6348507404327393, + "epoch": 7.69, + "learning_rate": 1.283929745468207e-05, + "loss": 0.7002, + "step": 9096, + "task_loss": 0.101246677339077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7383549213409424, + "epoch": 7.69, + "learning_rate": 1.2834601296139759e-05, + "loss": 0.7325, + "step": 9097, + "task_loss": 0.6932706236839294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6064481735229492, + "epoch": 7.69, + "learning_rate": 1.2829905137597445e-05, + "loss": 0.6677, + "step": 9098, + "task_loss": 0.9959902763366699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7460986375808716, + "epoch": 7.69, + "learning_rate": 1.2825208979055133e-05, + "loss": 0.8771, + "step": 9099, + "task_loss": 0.35735008120536804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31088483333587646, + "epoch": 7.69, + "learning_rate": 1.282051282051282e-05, + "loss": 0.4977, + "step": 9100, + "task_loss": 0.41531816124916077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4399469196796417, + "epoch": 7.69, + "learning_rate": 1.281581666197051e-05, + "loss": 0.4764, + "step": 9101, + "task_loss": 0.2638051509857178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5483927726745605, + "epoch": 7.69, + "learning_rate": 1.2811120503428194e-05, + "loss": 0.7473, + "step": 9102, + "task_loss": 0.3054583966732025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8244616389274597, + "epoch": 7.69, + "learning_rate": 1.2806424344885884e-05, + "loss": 0.6994, + "step": 9103, + "task_loss": 1.0471986532211304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3091169595718384, + "epoch": 7.7, + "learning_rate": 1.2801728186343574e-05, + "loss": 0.6471, + "step": 9104, + "task_loss": 1.2121503353118896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5193596482276917, + "epoch": 7.7, + "learning_rate": 1.2797032027801258e-05, + "loss": 0.4325, + "step": 9105, + "task_loss": 0.22953557968139648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3810580372810364, + "epoch": 7.7, + "learning_rate": 1.2792335869258948e-05, + "loss": 0.5, + "step": 9106, + "task_loss": 0.4794789254665375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.905901312828064, + "epoch": 7.7, + "learning_rate": 1.2787639710716634e-05, + "loss": 0.674, + "step": 9107, + "task_loss": 0.8848867416381836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6959438920021057, + "epoch": 7.7, + "learning_rate": 1.2782943552174322e-05, + "loss": 0.6946, + "step": 9108, + "task_loss": 0.8566098213195801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6713850498199463, + "epoch": 7.7, + "learning_rate": 1.2778247393632009e-05, + "loss": 0.6673, + "step": 9109, + "task_loss": 0.4564402997493744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0372607707977295, + "epoch": 7.7, + "learning_rate": 1.2773551235089698e-05, + "loss": 0.8259, + "step": 9110, + "task_loss": 1.082775354385376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36567747592926025, + "epoch": 7.7, + "learning_rate": 1.2768855076547385e-05, + "loss": 0.639, + "step": 9111, + "task_loss": 0.8418704867362976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5722159743309021, + "epoch": 7.7, + "learning_rate": 1.2764158918005073e-05, + "loss": 0.4769, + "step": 9112, + "task_loss": 0.3902459144592285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3350951075553894, + "epoch": 7.7, + "learning_rate": 1.275946275946276e-05, + "loss": 0.3293, + "step": 9113, + "task_loss": 0.6242761015892029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48244649171829224, + "epoch": 7.7, + "learning_rate": 1.2754766600920449e-05, + "loss": 0.6871, + "step": 9114, + "task_loss": 0.839927613735199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41007760167121887, + "epoch": 7.7, + "learning_rate": 1.2750070442378134e-05, + "loss": 0.6077, + "step": 9115, + "task_loss": 1.0131207704544067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44290614128112793, + "epoch": 7.71, + "learning_rate": 1.2745374283835823e-05, + "loss": 0.5402, + "step": 9116, + "task_loss": 0.2894012928009033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5615799427032471, + "epoch": 7.71, + "learning_rate": 1.274067812529351e-05, + "loss": 0.5994, + "step": 9117, + "task_loss": 0.7510315179824829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43828704953193665, + "epoch": 7.71, + "learning_rate": 1.2735981966751198e-05, + "loss": 0.6726, + "step": 9118, + "task_loss": 0.8060537576675415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.615739643573761, + "epoch": 7.71, + "learning_rate": 1.2731285808208887e-05, + "loss": 0.7048, + "step": 9119, + "task_loss": 0.24548418819904327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5567545890808105, + "epoch": 7.71, + "learning_rate": 1.2726589649666574e-05, + "loss": 0.5638, + "step": 9120, + "task_loss": 0.4917210340499878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7014592885971069, + "epoch": 7.71, + "learning_rate": 1.2721893491124262e-05, + "loss": 0.5981, + "step": 9121, + "task_loss": 1.6593323945999146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.580632209777832, + "epoch": 7.71, + "learning_rate": 1.2717197332581948e-05, + "loss": 0.6642, + "step": 9122, + "task_loss": 0.806998610496521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4691159725189209, + "epoch": 7.71, + "learning_rate": 1.2712501174039638e-05, + "loss": 0.5977, + "step": 9123, + "task_loss": 1.0986748933792114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5824064612388611, + "epoch": 7.71, + "learning_rate": 1.2707805015497322e-05, + "loss": 0.621, + "step": 9124, + "task_loss": 1.3755930662155151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.24288761615753174, + "epoch": 7.71, + "learning_rate": 1.2703108856955012e-05, + "loss": 0.5625, + "step": 9125, + "task_loss": 0.051154494285583496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4740070700645447, + "epoch": 7.71, + "learning_rate": 1.2698412698412699e-05, + "loss": 0.5192, + "step": 9126, + "task_loss": 0.4311660826206207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7269397974014282, + "epoch": 7.71, + "learning_rate": 1.2693716539870387e-05, + "loss": 0.6431, + "step": 9127, + "task_loss": 0.9512178897857666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5863138437271118, + "epoch": 7.72, + "learning_rate": 1.2689020381328073e-05, + "loss": 0.7079, + "step": 9128, + "task_loss": 1.4563648700714111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39926332235336304, + "epoch": 7.72, + "learning_rate": 1.2684324222785763e-05, + "loss": 0.5599, + "step": 9129, + "task_loss": 1.1902014017105103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4026908278465271, + "epoch": 7.72, + "learning_rate": 1.2679628064243449e-05, + "loss": 0.6192, + "step": 9130, + "task_loss": 0.7239860892295837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4563809335231781, + "epoch": 7.72, + "learning_rate": 1.2674931905701137e-05, + "loss": 0.7672, + "step": 9131, + "task_loss": 0.3404676020145416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7566729784011841, + "epoch": 7.72, + "learning_rate": 1.2670235747158823e-05, + "loss": 0.6782, + "step": 9132, + "task_loss": 0.42687010765075684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.18166440725326538, + "epoch": 7.72, + "learning_rate": 1.2665539588616513e-05, + "loss": 0.4507, + "step": 9133, + "task_loss": 0.036593835800886154 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4890053868293762, + "epoch": 7.72, + "learning_rate": 1.2660843430074201e-05, + "loss": 0.7933, + "step": 9134, + "task_loss": 1.1616226434707642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5537952780723572, + "epoch": 7.72, + "learning_rate": 1.2656147271531887e-05, + "loss": 0.5149, + "step": 9135, + "task_loss": 0.6069802045822144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5156192779541016, + "epoch": 7.72, + "learning_rate": 1.2651451112989575e-05, + "loss": 0.5399, + "step": 9136, + "task_loss": 1.1199874877929688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4933267831802368, + "epoch": 7.72, + "learning_rate": 1.2646754954447262e-05, + "loss": 0.6113, + "step": 9137, + "task_loss": 0.4185149073600769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7207887172698975, + "epoch": 7.72, + "learning_rate": 1.2642058795904952e-05, + "loss": 0.6317, + "step": 9138, + "task_loss": 0.6087276339530945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.705646276473999, + "epoch": 7.72, + "learning_rate": 1.2637362637362638e-05, + "loss": 0.8563, + "step": 9139, + "task_loss": 1.012412667274475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5746712684631348, + "epoch": 7.73, + "learning_rate": 1.2632666478820326e-05, + "loss": 0.666, + "step": 9140, + "task_loss": 1.0685789585113525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43536290526390076, + "epoch": 7.73, + "learning_rate": 1.2627970320278012e-05, + "loss": 0.5447, + "step": 9141, + "task_loss": 0.5452060103416443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5394901633262634, + "epoch": 7.73, + "learning_rate": 1.2623274161735702e-05, + "loss": 0.6106, + "step": 9142, + "task_loss": 1.300814151763916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9475088119506836, + "epoch": 7.73, + "learning_rate": 1.2618578003193388e-05, + "loss": 0.8658, + "step": 9143, + "task_loss": 1.3723053932189941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33425259590148926, + "epoch": 7.73, + "learning_rate": 1.2613881844651076e-05, + "loss": 0.5864, + "step": 9144, + "task_loss": 0.7204646468162537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7013534307479858, + "epoch": 7.73, + "learning_rate": 1.2609185686108763e-05, + "loss": 0.5867, + "step": 9145, + "task_loss": 1.0429000854492188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3698098063468933, + "epoch": 7.73, + "learning_rate": 1.260448952756645e-05, + "loss": 0.4618, + "step": 9146, + "task_loss": 0.3355768322944641 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7327256202697754, + "epoch": 7.73, + "learning_rate": 1.2599793369024137e-05, + "loss": 0.5611, + "step": 9147, + "task_loss": 0.5533397197723389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5181212425231934, + "epoch": 7.73, + "learning_rate": 1.2595097210481827e-05, + "loss": 0.5995, + "step": 9148, + "task_loss": 1.3067476749420166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5996986031532288, + "epoch": 7.73, + "learning_rate": 1.2590401051939515e-05, + "loss": 0.7406, + "step": 9149, + "task_loss": 0.5035154819488525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6594605445861816, + "epoch": 7.73, + "learning_rate": 1.2585704893397201e-05, + "loss": 0.7067, + "step": 9150, + "task_loss": 1.3502967357635498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0735517740249634, + "epoch": 7.73, + "learning_rate": 1.2581008734854891e-05, + "loss": 0.8276, + "step": 9151, + "task_loss": 1.7753902673721313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6830829381942749, + "epoch": 7.74, + "learning_rate": 1.2576312576312577e-05, + "loss": 0.5472, + "step": 9152, + "task_loss": 1.046669363975525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6330386400222778, + "epoch": 7.74, + "learning_rate": 1.2571616417770265e-05, + "loss": 0.5793, + "step": 9153, + "task_loss": 0.5685707330703735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3661718964576721, + "epoch": 7.74, + "learning_rate": 1.2566920259227952e-05, + "loss": 0.569, + "step": 9154, + "task_loss": 0.6226862668991089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3578072190284729, + "epoch": 7.74, + "learning_rate": 1.2562224100685641e-05, + "loss": 0.7324, + "step": 9155, + "task_loss": 0.4164118468761444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4766775667667389, + "epoch": 7.74, + "learning_rate": 1.2557527942143326e-05, + "loss": 0.5271, + "step": 9156, + "task_loss": 0.6421356201171875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5595275163650513, + "epoch": 7.74, + "learning_rate": 1.2552831783601016e-05, + "loss": 0.5607, + "step": 9157, + "task_loss": 1.1562862396240234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46680909395217896, + "epoch": 7.74, + "learning_rate": 1.2548135625058702e-05, + "loss": 0.5917, + "step": 9158, + "task_loss": 1.1353721618652344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6528694033622742, + "epoch": 7.74, + "learning_rate": 1.254343946651639e-05, + "loss": 0.6154, + "step": 9159, + "task_loss": 0.8469077348709106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3327333927154541, + "epoch": 7.74, + "learning_rate": 1.2538743307974076e-05, + "loss": 0.525, + "step": 9160, + "task_loss": 0.3638368248939514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6519312858581543, + "epoch": 7.74, + "learning_rate": 1.2534047149431766e-05, + "loss": 0.5975, + "step": 9161, + "task_loss": 0.9959648251533508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7973043918609619, + "epoch": 7.74, + "learning_rate": 1.2529350990889453e-05, + "loss": 0.6715, + "step": 9162, + "task_loss": 2.1171114444732666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5737801790237427, + "epoch": 7.75, + "learning_rate": 1.252465483234714e-05, + "loss": 0.5112, + "step": 9163, + "task_loss": 0.6324585676193237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5482625961303711, + "epoch": 7.75, + "learning_rate": 1.2519958673804827e-05, + "loss": 0.564, + "step": 9164, + "task_loss": 1.3259679079055786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9230426549911499, + "epoch": 7.75, + "learning_rate": 1.2515262515262517e-05, + "loss": 0.7227, + "step": 9165, + "task_loss": 0.6075318455696106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47031185030937195, + "epoch": 7.75, + "learning_rate": 1.2510566356720205e-05, + "loss": 0.4521, + "step": 9166, + "task_loss": 0.5812268257141113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9825012683868408, + "epoch": 7.75, + "learning_rate": 1.2505870198177891e-05, + "loss": 0.6809, + "step": 9167, + "task_loss": 1.353095531463623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7288048267364502, + "epoch": 7.75, + "learning_rate": 1.2501174039635579e-05, + "loss": 0.6443, + "step": 9168, + "task_loss": 0.7890456914901733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4178343117237091, + "epoch": 7.75, + "learning_rate": 1.2496477881093265e-05, + "loss": 0.4821, + "step": 9169, + "task_loss": 0.31060272455215454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4882894456386566, + "epoch": 7.75, + "learning_rate": 1.2491781722550953e-05, + "loss": 0.5343, + "step": 9170, + "task_loss": 0.662736177444458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5568426847457886, + "epoch": 7.75, + "learning_rate": 1.2487085564008641e-05, + "loss": 0.7393, + "step": 9171, + "task_loss": 0.9950714111328125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7702128291130066, + "epoch": 7.75, + "learning_rate": 1.2482389405466328e-05, + "loss": 0.7508, + "step": 9172, + "task_loss": 0.5499085783958435 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6226160526275635, + "epoch": 7.75, + "learning_rate": 1.2477693246924017e-05, + "loss": 0.6324, + "step": 9173, + "task_loss": 0.19818687438964844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8429038524627686, + "epoch": 7.75, + "learning_rate": 1.2472997088381706e-05, + "loss": 0.6864, + "step": 9174, + "task_loss": 0.8326199650764465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6501444578170776, + "epoch": 7.76, + "learning_rate": 1.2468300929839392e-05, + "loss": 0.5998, + "step": 9175, + "task_loss": 0.8628482222557068 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9112529754638672, + "epoch": 7.76, + "learning_rate": 1.246360477129708e-05, + "loss": 0.6751, + "step": 9176, + "task_loss": 0.5033800601959229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.559431791305542, + "epoch": 7.76, + "learning_rate": 1.2458908612754768e-05, + "loss": 0.5565, + "step": 9177, + "task_loss": 1.1971118450164795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4931434392929077, + "epoch": 7.76, + "learning_rate": 1.2454212454212454e-05, + "loss": 0.5114, + "step": 9178, + "task_loss": 0.7337039709091187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48618826270103455, + "epoch": 7.76, + "learning_rate": 1.2449516295670142e-05, + "loss": 0.6297, + "step": 9179, + "task_loss": 0.47725483775138855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8665053844451904, + "epoch": 7.76, + "learning_rate": 1.244482013712783e-05, + "loss": 0.7606, + "step": 9180, + "task_loss": 0.9061046838760376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4813166558742523, + "epoch": 7.76, + "learning_rate": 1.2440123978585518e-05, + "loss": 0.6378, + "step": 9181, + "task_loss": 0.9006826877593994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32420799136161804, + "epoch": 7.76, + "learning_rate": 1.2435427820043205e-05, + "loss": 0.4714, + "step": 9182, + "task_loss": 0.10279767960309982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6458555459976196, + "epoch": 7.76, + "learning_rate": 1.2430731661500893e-05, + "loss": 0.7872, + "step": 9183, + "task_loss": 0.34597325325012207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6857365369796753, + "epoch": 7.76, + "learning_rate": 1.242603550295858e-05, + "loss": 0.5362, + "step": 9184, + "task_loss": 0.8836653232574463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5644246339797974, + "epoch": 7.76, + "learning_rate": 1.2421339344416267e-05, + "loss": 0.6422, + "step": 9185, + "task_loss": 0.2629653215408325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4994243383407593, + "epoch": 7.76, + "learning_rate": 1.2416643185873955e-05, + "loss": 0.5712, + "step": 9186, + "task_loss": 1.4908876419067383 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5470830202102661, + "epoch": 7.77, + "learning_rate": 1.2411947027331643e-05, + "loss": 0.6192, + "step": 9187, + "task_loss": 0.2291332185268402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4141067862510681, + "epoch": 7.77, + "learning_rate": 1.2407250868789331e-05, + "loss": 0.498, + "step": 9188, + "task_loss": 0.3309977650642395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7059606909751892, + "epoch": 7.77, + "learning_rate": 1.240255471024702e-05, + "loss": 0.543, + "step": 9189, + "task_loss": 1.2484396696090698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6303499937057495, + "epoch": 7.77, + "learning_rate": 1.2397858551704707e-05, + "loss": 0.6173, + "step": 9190, + "task_loss": 0.7463486790657043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1179356575012207, + "epoch": 7.77, + "learning_rate": 1.2393162393162394e-05, + "loss": 0.6472, + "step": 9191, + "task_loss": 1.4096165895462036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3600737452507019, + "epoch": 7.77, + "learning_rate": 1.2388466234620082e-05, + "loss": 0.5546, + "step": 9192, + "task_loss": 0.8780907392501831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8203255534172058, + "epoch": 7.77, + "learning_rate": 1.238377007607777e-05, + "loss": 0.6403, + "step": 9193, + "task_loss": 1.1304914951324463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7810653448104858, + "epoch": 7.77, + "learning_rate": 1.2379073917535456e-05, + "loss": 0.649, + "step": 9194, + "task_loss": 1.6029305458068848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38124603033065796, + "epoch": 7.77, + "learning_rate": 1.2374377758993144e-05, + "loss": 0.629, + "step": 9195, + "task_loss": 0.05848316103219986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5188174843788147, + "epoch": 7.77, + "learning_rate": 1.2369681600450832e-05, + "loss": 0.5927, + "step": 9196, + "task_loss": 0.5620988011360168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7826317548751831, + "epoch": 7.77, + "learning_rate": 1.2364985441908518e-05, + "loss": 0.6421, + "step": 9197, + "task_loss": 1.175435185432434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43062925338745117, + "epoch": 7.77, + "learning_rate": 1.2360289283366206e-05, + "loss": 0.5141, + "step": 9198, + "task_loss": 0.4835112392902374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5537911057472229, + "epoch": 7.78, + "learning_rate": 1.2355593124823895e-05, + "loss": 0.5606, + "step": 9199, + "task_loss": 0.8834670186042786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5750342607498169, + "epoch": 7.78, + "learning_rate": 1.2350896966281583e-05, + "loss": 0.5963, + "step": 9200, + "task_loss": 1.5788229703903198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6749097108840942, + "epoch": 7.78, + "learning_rate": 1.2346200807739269e-05, + "loss": 0.8095, + "step": 9201, + "task_loss": 1.3601738214492798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9442257881164551, + "epoch": 7.78, + "learning_rate": 1.2341504649196957e-05, + "loss": 0.7306, + "step": 9202, + "task_loss": 0.9562928676605225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5915853381156921, + "epoch": 7.78, + "learning_rate": 1.2336808490654645e-05, + "loss": 0.622, + "step": 9203, + "task_loss": 1.65479576587677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4696558117866516, + "epoch": 7.78, + "learning_rate": 1.2332112332112333e-05, + "loss": 0.6024, + "step": 9204, + "task_loss": 0.9211612343788147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7393296957015991, + "epoch": 7.78, + "learning_rate": 1.2327416173570021e-05, + "loss": 0.6212, + "step": 9205, + "task_loss": 0.69806307554245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6790538430213928, + "epoch": 7.78, + "learning_rate": 1.2322720015027709e-05, + "loss": 0.7038, + "step": 9206, + "task_loss": 0.47628894448280334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5161414742469788, + "epoch": 7.78, + "learning_rate": 1.2318023856485395e-05, + "loss": 0.5931, + "step": 9207, + "task_loss": 0.6558088064193726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4002099335193634, + "epoch": 7.78, + "learning_rate": 1.2313327697943083e-05, + "loss": 0.6474, + "step": 9208, + "task_loss": 0.47103366255760193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46449607610702515, + "epoch": 7.78, + "learning_rate": 1.2308631539400771e-05, + "loss": 0.632, + "step": 9209, + "task_loss": 0.21072576940059662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.334917277097702, + "epoch": 7.78, + "learning_rate": 1.2303935380858458e-05, + "loss": 0.5512, + "step": 9210, + "task_loss": 1.0161563158035278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4747670888900757, + "epoch": 7.79, + "learning_rate": 1.2299239222316146e-05, + "loss": 0.5291, + "step": 9211, + "task_loss": 0.663444459438324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7980385422706604, + "epoch": 7.79, + "learning_rate": 1.2294543063773834e-05, + "loss": 0.8725, + "step": 9212, + "task_loss": 1.548295021057129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40648791193962097, + "epoch": 7.79, + "learning_rate": 1.228984690523152e-05, + "loss": 0.5279, + "step": 9213, + "task_loss": 0.6408501863479614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46807074546813965, + "epoch": 7.79, + "learning_rate": 1.2285150746689208e-05, + "loss": 0.6428, + "step": 9214, + "task_loss": 0.21138949692249298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4050152003765106, + "epoch": 7.79, + "learning_rate": 1.2280454588146896e-05, + "loss": 0.4776, + "step": 9215, + "task_loss": 0.8811978101730347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5405423641204834, + "epoch": 7.79, + "learning_rate": 1.2275758429604584e-05, + "loss": 0.634, + "step": 9216, + "task_loss": 0.6027246713638306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5813490748405457, + "epoch": 7.79, + "learning_rate": 1.227106227106227e-05, + "loss": 0.5985, + "step": 9217, + "task_loss": 0.8912213444709778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9439423084259033, + "epoch": 7.79, + "learning_rate": 1.2266366112519959e-05, + "loss": 0.6345, + "step": 9218, + "task_loss": 1.637570858001709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3715919554233551, + "epoch": 7.79, + "learning_rate": 1.2261669953977647e-05, + "loss": 0.6343, + "step": 9219, + "task_loss": 0.49685850739479065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6585158109664917, + "epoch": 7.79, + "learning_rate": 1.2256973795435335e-05, + "loss": 0.5348, + "step": 9220, + "task_loss": 0.808984637260437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5752423405647278, + "epoch": 7.79, + "learning_rate": 1.2252277636893023e-05, + "loss": 0.7123, + "step": 9221, + "task_loss": 0.29104629158973694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4112965166568756, + "epoch": 7.79, + "learning_rate": 1.224758147835071e-05, + "loss": 0.6571, + "step": 9222, + "task_loss": 0.479967325925827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7448669672012329, + "epoch": 7.8, + "learning_rate": 1.2242885319808397e-05, + "loss": 0.5892, + "step": 9223, + "task_loss": 0.7633129358291626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3633204996585846, + "epoch": 7.8, + "learning_rate": 1.2238189161266085e-05, + "loss": 0.4995, + "step": 9224, + "task_loss": 0.643604576587677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4923507273197174, + "epoch": 7.8, + "learning_rate": 1.2233493002723773e-05, + "loss": 0.6384, + "step": 9225, + "task_loss": 0.7829862236976624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5855726003646851, + "epoch": 7.8, + "learning_rate": 1.222879684418146e-05, + "loss": 0.8244, + "step": 9226, + "task_loss": 1.1817216873168945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6873863339424133, + "epoch": 7.8, + "learning_rate": 1.2224100685639148e-05, + "loss": 0.6626, + "step": 9227, + "task_loss": 1.2474788427352905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3575519919395447, + "epoch": 7.8, + "learning_rate": 1.2219404527096836e-05, + "loss": 0.4941, + "step": 9228, + "task_loss": 0.30007925629615784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9238381385803223, + "epoch": 7.8, + "learning_rate": 1.2214708368554522e-05, + "loss": 0.6408, + "step": 9229, + "task_loss": 0.8237901926040649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3202037811279297, + "epoch": 7.8, + "learning_rate": 1.221001221001221e-05, + "loss": 0.5442, + "step": 9230, + "task_loss": 0.8448647260665894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49808260798454285, + "epoch": 7.8, + "learning_rate": 1.2205316051469898e-05, + "loss": 0.5241, + "step": 9231, + "task_loss": 0.41476675868034363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6203761100769043, + "epoch": 7.8, + "learning_rate": 1.2200619892927586e-05, + "loss": 0.5033, + "step": 9232, + "task_loss": 0.7833760976791382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7851611375808716, + "epoch": 7.8, + "learning_rate": 1.2195923734385272e-05, + "loss": 0.6598, + "step": 9233, + "task_loss": 1.068286418914795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8005359768867493, + "epoch": 7.81, + "learning_rate": 1.2191227575842962e-05, + "loss": 0.6553, + "step": 9234, + "task_loss": 0.8778249621391296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6215698719024658, + "epoch": 7.81, + "learning_rate": 1.2186531417300648e-05, + "loss": 0.6302, + "step": 9235, + "task_loss": 0.5525217056274414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6312036514282227, + "epoch": 7.81, + "learning_rate": 1.2181835258758337e-05, + "loss": 0.5289, + "step": 9236, + "task_loss": 0.8525466918945312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6585193872451782, + "epoch": 7.81, + "learning_rate": 1.2177139100216025e-05, + "loss": 0.6005, + "step": 9237, + "task_loss": 1.166562557220459 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8315794467926025, + "epoch": 7.81, + "learning_rate": 1.2172442941673713e-05, + "loss": 0.6606, + "step": 9238, + "task_loss": 0.45985886454582214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7767733931541443, + "epoch": 7.81, + "learning_rate": 1.2167746783131399e-05, + "loss": 0.4407, + "step": 9239, + "task_loss": 0.7310854196548462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4326306879520416, + "epoch": 7.81, + "learning_rate": 1.2163050624589087e-05, + "loss": 0.6789, + "step": 9240, + "task_loss": 0.13972207903862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5320179462432861, + "epoch": 7.81, + "learning_rate": 1.2158354466046775e-05, + "loss": 0.5768, + "step": 9241, + "task_loss": 0.4122759699821472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6065739393234253, + "epoch": 7.81, + "learning_rate": 1.2153658307504461e-05, + "loss": 0.5502, + "step": 9242, + "task_loss": 0.8673749566078186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6351373195648193, + "epoch": 7.81, + "learning_rate": 1.214896214896215e-05, + "loss": 0.7261, + "step": 9243, + "task_loss": 0.6195255517959595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4116397500038147, + "epoch": 7.81, + "learning_rate": 1.2144265990419837e-05, + "loss": 0.4012, + "step": 9244, + "task_loss": 1.0053728818893433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6081851720809937, + "epoch": 7.81, + "learning_rate": 1.2139569831877524e-05, + "loss": 0.7202, + "step": 9245, + "task_loss": 1.4444200992584229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4677433967590332, + "epoch": 7.82, + "learning_rate": 1.2134873673335212e-05, + "loss": 0.6233, + "step": 9246, + "task_loss": 0.40733927488327026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6671141386032104, + "epoch": 7.82, + "learning_rate": 1.21301775147929e-05, + "loss": 0.5914, + "step": 9247, + "task_loss": 0.8897861242294312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.880736231803894, + "epoch": 7.82, + "learning_rate": 1.2125481356250588e-05, + "loss": 0.7564, + "step": 9248, + "task_loss": 1.4604625701904297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48671454191207886, + "epoch": 7.82, + "learning_rate": 1.2120785197708274e-05, + "loss": 0.4974, + "step": 9249, + "task_loss": 0.7679075598716736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41997334361076355, + "epoch": 7.82, + "learning_rate": 1.2116089039165964e-05, + "loss": 0.5089, + "step": 9250, + "task_loss": 0.5197562575340271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6217628121376038, + "epoch": 7.82, + "learning_rate": 1.211139288062365e-05, + "loss": 0.6808, + "step": 9251, + "task_loss": 1.4101309776306152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6243614554405212, + "epoch": 7.82, + "learning_rate": 1.2106696722081338e-05, + "loss": 0.6501, + "step": 9252, + "task_loss": 1.6311492919921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7083710432052612, + "epoch": 7.82, + "learning_rate": 1.2102000563539026e-05, + "loss": 0.7277, + "step": 9253, + "task_loss": 0.5568885207176208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5752865076065063, + "epoch": 7.82, + "learning_rate": 1.2097304404996714e-05, + "loss": 0.6648, + "step": 9254, + "task_loss": 0.4532527029514313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8630462288856506, + "epoch": 7.82, + "learning_rate": 1.20926082464544e-05, + "loss": 0.7515, + "step": 9255, + "task_loss": 1.0270709991455078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5599292516708374, + "epoch": 7.82, + "learning_rate": 1.2087912087912089e-05, + "loss": 0.5382, + "step": 9256, + "task_loss": 0.7215776443481445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33189085125923157, + "epoch": 7.82, + "learning_rate": 1.2083215929369777e-05, + "loss": 0.4737, + "step": 9257, + "task_loss": 0.7015829682350159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.502589225769043, + "epoch": 7.83, + "learning_rate": 1.2078519770827463e-05, + "loss": 0.463, + "step": 9258, + "task_loss": 0.3994985818862915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.010279655456543, + "epoch": 7.83, + "learning_rate": 1.2073823612285151e-05, + "loss": 0.7966, + "step": 9259, + "task_loss": 1.2055457830429077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8681115508079529, + "epoch": 7.83, + "learning_rate": 1.206912745374284e-05, + "loss": 0.6622, + "step": 9260, + "task_loss": 1.4711294174194336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7401294708251953, + "epoch": 7.83, + "learning_rate": 1.2064431295200525e-05, + "loss": 0.6582, + "step": 9261, + "task_loss": 1.0682728290557861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2743995189666748, + "epoch": 7.83, + "learning_rate": 1.2059735136658214e-05, + "loss": 0.7289, + "step": 9262, + "task_loss": 0.09796668589115143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.596108078956604, + "epoch": 7.83, + "learning_rate": 1.2055038978115902e-05, + "loss": 0.4158, + "step": 9263, + "task_loss": 0.6065584421157837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8598300814628601, + "epoch": 7.83, + "learning_rate": 1.2050342819573588e-05, + "loss": 0.5242, + "step": 9264, + "task_loss": 1.0548279285430908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.604052722454071, + "epoch": 7.83, + "learning_rate": 1.2045646661031278e-05, + "loss": 0.5993, + "step": 9265, + "task_loss": 0.30261075496673584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4653339982032776, + "epoch": 7.83, + "learning_rate": 1.2040950502488966e-05, + "loss": 0.7732, + "step": 9266, + "task_loss": 0.20248746871948242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8494768142700195, + "epoch": 7.83, + "learning_rate": 1.2036254343946652e-05, + "loss": 0.6431, + "step": 9267, + "task_loss": 1.488345980644226 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.478204607963562, + "epoch": 7.83, + "learning_rate": 1.203155818540434e-05, + "loss": 0.6697, + "step": 9268, + "task_loss": 1.0240557193756104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37322986125946045, + "epoch": 7.83, + "learning_rate": 1.2026862026862028e-05, + "loss": 0.5097, + "step": 9269, + "task_loss": 0.47234973311424255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5139741897583008, + "epoch": 7.84, + "learning_rate": 1.2022165868319714e-05, + "loss": 0.5897, + "step": 9270, + "task_loss": 0.07266499102115631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5163543224334717, + "epoch": 7.84, + "learning_rate": 1.2017469709777402e-05, + "loss": 0.6082, + "step": 9271, + "task_loss": 0.35763078927993774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8312714695930481, + "epoch": 7.84, + "learning_rate": 1.201277355123509e-05, + "loss": 0.7521, + "step": 9272, + "task_loss": 0.8444193005561829 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6552348136901855, + "epoch": 7.84, + "learning_rate": 1.2008077392692779e-05, + "loss": 0.6107, + "step": 9273, + "task_loss": 1.1751883029937744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38688188791275024, + "epoch": 7.84, + "learning_rate": 1.2003381234150465e-05, + "loss": 0.4258, + "step": 9274, + "task_loss": 0.9498867988586426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8224754333496094, + "epoch": 7.84, + "learning_rate": 1.1998685075608153e-05, + "loss": 0.6323, + "step": 9275, + "task_loss": 1.7587980031967163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5195527672767639, + "epoch": 7.84, + "learning_rate": 1.1993988917065841e-05, + "loss": 0.6414, + "step": 9276, + "task_loss": 0.705010712146759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6050170063972473, + "epoch": 7.84, + "learning_rate": 1.1989292758523527e-05, + "loss": 0.4825, + "step": 9277, + "task_loss": 0.8411605358123779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3979472815990448, + "epoch": 7.84, + "learning_rate": 1.1984596599981215e-05, + "loss": 0.5386, + "step": 9278, + "task_loss": 0.5219746828079224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2698785662651062, + "epoch": 7.84, + "learning_rate": 1.1979900441438903e-05, + "loss": 0.4869, + "step": 9279, + "task_loss": 0.09238096326589584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5532017350196838, + "epoch": 7.84, + "learning_rate": 1.1975204282896591e-05, + "loss": 0.5909, + "step": 9280, + "task_loss": 0.26685094833374023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7017921805381775, + "epoch": 7.84, + "learning_rate": 1.197050812435428e-05, + "loss": 0.5401, + "step": 9281, + "task_loss": 0.7231501936912537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6045128703117371, + "epoch": 7.85, + "learning_rate": 1.1965811965811967e-05, + "loss": 0.6947, + "step": 9282, + "task_loss": 0.5414072871208191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4864766597747803, + "epoch": 7.85, + "learning_rate": 1.1961115807269654e-05, + "loss": 0.4459, + "step": 9283, + "task_loss": 1.333050012588501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7981334924697876, + "epoch": 7.85, + "learning_rate": 1.1956419648727342e-05, + "loss": 0.6235, + "step": 9284, + "task_loss": 0.6602625250816345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.760779619216919, + "epoch": 7.85, + "learning_rate": 1.195172349018503e-05, + "loss": 0.6774, + "step": 9285, + "task_loss": 0.7662138938903809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8918028473854065, + "epoch": 7.85, + "learning_rate": 1.1947027331642716e-05, + "loss": 0.7905, + "step": 9286, + "task_loss": 0.8125105500221252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4679436981678009, + "epoch": 7.85, + "learning_rate": 1.1942331173100404e-05, + "loss": 0.5304, + "step": 9287, + "task_loss": 0.6071472764015198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49994805455207825, + "epoch": 7.85, + "learning_rate": 1.1937635014558092e-05, + "loss": 0.5519, + "step": 9288, + "task_loss": 1.2786537408828735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5234246850013733, + "epoch": 7.85, + "learning_rate": 1.193293885601578e-05, + "loss": 0.5673, + "step": 9289, + "task_loss": 0.4634457528591156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49539893865585327, + "epoch": 7.85, + "learning_rate": 1.1928242697473467e-05, + "loss": 0.5259, + "step": 9290, + "task_loss": 0.938583493232727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45192965865135193, + "epoch": 7.85, + "learning_rate": 1.1923546538931155e-05, + "loss": 0.4154, + "step": 9291, + "task_loss": 0.4525604844093323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.66815185546875, + "epoch": 7.85, + "learning_rate": 1.1918850380388843e-05, + "loss": 0.5826, + "step": 9292, + "task_loss": 1.1670300960540771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6612383723258972, + "epoch": 7.85, + "learning_rate": 1.1914154221846529e-05, + "loss": 0.564, + "step": 9293, + "task_loss": 0.3871161639690399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44947350025177, + "epoch": 7.86, + "learning_rate": 1.1909458063304217e-05, + "loss": 0.5045, + "step": 9294, + "task_loss": 0.7586336135864258 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.50494384765625, + "epoch": 7.86, + "learning_rate": 1.1904761904761905e-05, + "loss": 0.5486, + "step": 9295, + "task_loss": 0.4372828006744385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5112669467926025, + "epoch": 7.86, + "learning_rate": 1.1900065746219593e-05, + "loss": 0.5172, + "step": 9296, + "task_loss": 0.48432207107543945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7695194482803345, + "epoch": 7.86, + "learning_rate": 1.1895369587677281e-05, + "loss": 0.71, + "step": 9297, + "task_loss": 0.7044280767440796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9119033813476562, + "epoch": 7.86, + "learning_rate": 1.189067342913497e-05, + "loss": 0.6785, + "step": 9298, + "task_loss": 0.6933794617652893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8601858019828796, + "epoch": 7.86, + "learning_rate": 1.1885977270592656e-05, + "loss": 0.699, + "step": 9299, + "task_loss": 1.2532027959823608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7136693596839905, + "epoch": 7.86, + "learning_rate": 1.1881281112050344e-05, + "loss": 0.6999, + "step": 9300, + "task_loss": 1.7915784120559692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5366383790969849, + "epoch": 7.86, + "learning_rate": 1.1876584953508032e-05, + "loss": 0.5622, + "step": 9301, + "task_loss": 0.46840882301330566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.568282425403595, + "epoch": 7.86, + "learning_rate": 1.1871888794965718e-05, + "loss": 0.5758, + "step": 9302, + "task_loss": 1.2255793809890747 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5616037845611572, + "epoch": 7.86, + "learning_rate": 1.1867192636423406e-05, + "loss": 0.506, + "step": 9303, + "task_loss": 0.5363072752952576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4846736490726471, + "epoch": 7.86, + "learning_rate": 1.1862496477881094e-05, + "loss": 0.63, + "step": 9304, + "task_loss": 1.1016818284988403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6819823980331421, + "epoch": 7.87, + "learning_rate": 1.1857800319338782e-05, + "loss": 0.6388, + "step": 9305, + "task_loss": 0.6065698266029358 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7092506289482117, + "epoch": 7.87, + "learning_rate": 1.1853104160796468e-05, + "loss": 0.64, + "step": 9306, + "task_loss": 1.0774624347686768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9463523626327515, + "epoch": 7.87, + "learning_rate": 1.1848408002254156e-05, + "loss": 0.7726, + "step": 9307, + "task_loss": 1.0818854570388794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5058406591415405, + "epoch": 7.87, + "learning_rate": 1.1843711843711844e-05, + "loss": 0.5991, + "step": 9308, + "task_loss": 0.598130464553833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8943616151809692, + "epoch": 7.87, + "learning_rate": 1.183901568516953e-05, + "loss": 0.8202, + "step": 9309, + "task_loss": 0.7584429979324341 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.380227267742157, + "epoch": 7.87, + "learning_rate": 1.1834319526627219e-05, + "loss": 0.5901, + "step": 9310, + "task_loss": 0.9526321291923523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6819581985473633, + "epoch": 7.87, + "learning_rate": 1.1829623368084909e-05, + "loss": 0.7759, + "step": 9311, + "task_loss": 1.0904330015182495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6405022144317627, + "epoch": 7.87, + "learning_rate": 1.1824927209542595e-05, + "loss": 0.7902, + "step": 9312, + "task_loss": 0.9204011559486389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.521255612373352, + "epoch": 7.87, + "learning_rate": 1.1820231051000283e-05, + "loss": 0.6348, + "step": 9313, + "task_loss": 0.7240114808082581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.490625262260437, + "epoch": 7.87, + "learning_rate": 1.1815534892457971e-05, + "loss": 0.7213, + "step": 9314, + "task_loss": 0.6157011389732361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38426339626312256, + "epoch": 7.87, + "learning_rate": 1.1810838733915657e-05, + "loss": 0.6538, + "step": 9315, + "task_loss": 0.8507285118103027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3548632264137268, + "epoch": 7.87, + "learning_rate": 1.1806142575373345e-05, + "loss": 0.5257, + "step": 9316, + "task_loss": 0.371065229177475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0167105197906494, + "epoch": 7.88, + "learning_rate": 1.1801446416831033e-05, + "loss": 0.7179, + "step": 9317, + "task_loss": 1.4678363800048828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.687595784664154, + "epoch": 7.88, + "learning_rate": 1.179675025828872e-05, + "loss": 0.7365, + "step": 9318, + "task_loss": 0.7651093006134033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6699698567390442, + "epoch": 7.88, + "learning_rate": 1.1792054099746408e-05, + "loss": 0.7491, + "step": 9319, + "task_loss": 0.6311319470405579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34880444407463074, + "epoch": 7.88, + "learning_rate": 1.1787357941204096e-05, + "loss": 0.6028, + "step": 9320, + "task_loss": 0.5314533710479736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5252175331115723, + "epoch": 7.88, + "learning_rate": 1.1782661782661784e-05, + "loss": 0.5801, + "step": 9321, + "task_loss": 0.8142345547676086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9274237155914307, + "epoch": 7.88, + "learning_rate": 1.177796562411947e-05, + "loss": 0.6422, + "step": 9322, + "task_loss": 0.4044482111930847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3893706202507019, + "epoch": 7.88, + "learning_rate": 1.1773269465577158e-05, + "loss": 0.5816, + "step": 9323, + "task_loss": 0.5028839111328125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6463404297828674, + "epoch": 7.88, + "learning_rate": 1.1768573307034846e-05, + "loss": 0.5668, + "step": 9324, + "task_loss": 1.439249873161316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2882609963417053, + "epoch": 7.88, + "learning_rate": 1.1763877148492533e-05, + "loss": 0.5096, + "step": 9325, + "task_loss": 0.9778980016708374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9949068427085876, + "epoch": 7.88, + "learning_rate": 1.175918098995022e-05, + "loss": 0.7547, + "step": 9326, + "task_loss": 1.558409571647644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5672178864479065, + "epoch": 7.88, + "learning_rate": 1.175448483140791e-05, + "loss": 0.683, + "step": 9327, + "task_loss": 0.4719322621822357 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7025935053825378, + "epoch": 7.88, + "learning_rate": 1.1749788672865597e-05, + "loss": 0.6788, + "step": 9328, + "task_loss": 1.3948453664779663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4702918529510498, + "epoch": 7.89, + "learning_rate": 1.1745092514323285e-05, + "loss": 0.5737, + "step": 9329, + "task_loss": 1.1162681579589844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3098708987236023, + "epoch": 7.89, + "learning_rate": 1.1740396355780973e-05, + "loss": 0.5893, + "step": 9330, + "task_loss": 0.4862585961818695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6313767433166504, + "epoch": 7.89, + "learning_rate": 1.1735700197238659e-05, + "loss": 0.5906, + "step": 9331, + "task_loss": 0.3238650858402252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5914211869239807, + "epoch": 7.89, + "learning_rate": 1.1731004038696347e-05, + "loss": 0.794, + "step": 9332, + "task_loss": 0.7812368869781494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6282732486724854, + "epoch": 7.89, + "learning_rate": 1.1726307880154035e-05, + "loss": 0.4491, + "step": 9333, + "task_loss": 1.3304002285003662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6539148092269897, + "epoch": 7.89, + "learning_rate": 1.1721611721611721e-05, + "loss": 0.6382, + "step": 9334, + "task_loss": 0.5356806516647339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3320097029209137, + "epoch": 7.89, + "learning_rate": 1.171691556306941e-05, + "loss": 0.59, + "step": 9335, + "task_loss": 0.6513996124267578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9658387899398804, + "epoch": 7.89, + "learning_rate": 1.1712219404527098e-05, + "loss": 0.7695, + "step": 9336, + "task_loss": 1.1512246131896973 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.22772479057312012, + "epoch": 7.89, + "learning_rate": 1.1707523245984786e-05, + "loss": 0.5025, + "step": 9337, + "task_loss": 0.341417521238327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6229044198989868, + "epoch": 7.89, + "learning_rate": 1.1702827087442472e-05, + "loss": 0.7288, + "step": 9338, + "task_loss": 1.080311894416809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9156361818313599, + "epoch": 7.89, + "learning_rate": 1.169813092890016e-05, + "loss": 0.7711, + "step": 9339, + "task_loss": 1.792427659034729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4877018928527832, + "epoch": 7.89, + "learning_rate": 1.1693434770357848e-05, + "loss": 0.4946, + "step": 9340, + "task_loss": 0.690035343170166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3781812787055969, + "epoch": 7.9, + "learning_rate": 1.1688738611815534e-05, + "loss": 0.4752, + "step": 9341, + "task_loss": 0.04009866714477539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3604905605316162, + "epoch": 7.9, + "learning_rate": 1.1684042453273224e-05, + "loss": 0.5822, + "step": 9342, + "task_loss": 0.21230128407478333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48242148756980896, + "epoch": 7.9, + "learning_rate": 1.1679346294730912e-05, + "loss": 0.6545, + "step": 9343, + "task_loss": 0.9765806198120117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6387937664985657, + "epoch": 7.9, + "learning_rate": 1.1674650136188598e-05, + "loss": 0.7311, + "step": 9344, + "task_loss": 0.7566108107566833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6157197952270508, + "epoch": 7.9, + "learning_rate": 1.1669953977646286e-05, + "loss": 0.6148, + "step": 9345, + "task_loss": 0.24975921213626862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6847743988037109, + "epoch": 7.9, + "learning_rate": 1.1665257819103974e-05, + "loss": 0.6077, + "step": 9346, + "task_loss": 1.2689799070358276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40344005823135376, + "epoch": 7.9, + "learning_rate": 1.166056166056166e-05, + "loss": 0.4981, + "step": 9347, + "task_loss": 0.3088672459125519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7288113832473755, + "epoch": 7.9, + "learning_rate": 1.1655865502019349e-05, + "loss": 0.6156, + "step": 9348, + "task_loss": 0.571753978729248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7345532178878784, + "epoch": 7.9, + "learning_rate": 1.1651169343477037e-05, + "loss": 0.6343, + "step": 9349, + "task_loss": 0.8760285973548889 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45350104570388794, + "epoch": 7.9, + "learning_rate": 1.1646473184934723e-05, + "loss": 0.6294, + "step": 9350, + "task_loss": 0.6747958660125732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3357434868812561, + "epoch": 7.9, + "learning_rate": 1.1641777026392411e-05, + "loss": 0.5282, + "step": 9351, + "task_loss": 0.089142806828022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6920523643493652, + "epoch": 7.9, + "learning_rate": 1.16370808678501e-05, + "loss": 0.5854, + "step": 9352, + "task_loss": 0.7401637434959412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8352468609809875, + "epoch": 7.91, + "learning_rate": 1.1632384709307786e-05, + "loss": 0.5755, + "step": 9353, + "task_loss": 0.8834419846534729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5791182518005371, + "epoch": 7.91, + "learning_rate": 1.1627688550765474e-05, + "loss": 0.6237, + "step": 9354, + "task_loss": 0.8880972862243652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3421515226364136, + "epoch": 7.91, + "learning_rate": 1.1622992392223162e-05, + "loss": 0.5417, + "step": 9355, + "task_loss": 0.424744188785553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5214434862136841, + "epoch": 7.91, + "learning_rate": 1.161829623368085e-05, + "loss": 0.7062, + "step": 9356, + "task_loss": 0.5954174995422363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5210024118423462, + "epoch": 7.91, + "learning_rate": 1.1613600075138538e-05, + "loss": 0.6194, + "step": 9357, + "task_loss": 0.7415909171104431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6676424741744995, + "epoch": 7.91, + "learning_rate": 1.1608903916596226e-05, + "loss": 0.6732, + "step": 9358, + "task_loss": 0.5274196863174438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8020111322402954, + "epoch": 7.91, + "learning_rate": 1.1604207758053912e-05, + "loss": 0.6354, + "step": 9359, + "task_loss": 0.14721661806106567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3807380795478821, + "epoch": 7.91, + "learning_rate": 1.15995115995116e-05, + "loss": 0.4464, + "step": 9360, + "task_loss": 0.637386679649353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.922400176525116, + "epoch": 7.91, + "learning_rate": 1.1594815440969288e-05, + "loss": 0.6942, + "step": 9361, + "task_loss": 1.6940990686416626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39947760105133057, + "epoch": 7.91, + "learning_rate": 1.1590119282426976e-05, + "loss": 0.593, + "step": 9362, + "task_loss": 0.5549744367599487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3199881315231323, + "epoch": 7.91, + "learning_rate": 1.1585423123884663e-05, + "loss": 0.4834, + "step": 9363, + "task_loss": 0.2715682089328766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6301392316818237, + "epoch": 7.91, + "learning_rate": 1.158072696534235e-05, + "loss": 0.6499, + "step": 9364, + "task_loss": 0.257243812084198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7888371348381042, + "epoch": 7.92, + "learning_rate": 1.1576030806800039e-05, + "loss": 0.5809, + "step": 9365, + "task_loss": 0.987948477268219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8416552543640137, + "epoch": 7.92, + "learning_rate": 1.1571334648257725e-05, + "loss": 0.4506, + "step": 9366, + "task_loss": 0.6163206100463867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6055905818939209, + "epoch": 7.92, + "learning_rate": 1.1566638489715413e-05, + "loss": 0.5845, + "step": 9367, + "task_loss": 0.5403327941894531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4669856131076813, + "epoch": 7.92, + "learning_rate": 1.1561942331173101e-05, + "loss": 0.6346, + "step": 9368, + "task_loss": 0.7165498733520508 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6917380094528198, + "epoch": 7.92, + "learning_rate": 1.1557246172630787e-05, + "loss": 0.6259, + "step": 9369, + "task_loss": 1.0885255336761475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5442982316017151, + "epoch": 7.92, + "learning_rate": 1.1552550014088475e-05, + "loss": 0.602, + "step": 9370, + "task_loss": 0.3081853687763214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4410777688026428, + "epoch": 7.92, + "learning_rate": 1.1547853855546163e-05, + "loss": 0.5903, + "step": 9371, + "task_loss": 0.9331405758857727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4594109058380127, + "epoch": 7.92, + "learning_rate": 1.1543157697003852e-05, + "loss": 0.6657, + "step": 9372, + "task_loss": 0.48760098218917847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5494014620780945, + "epoch": 7.92, + "learning_rate": 1.153846153846154e-05, + "loss": 0.7147, + "step": 9373, + "task_loss": 1.1481456756591797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.791900634765625, + "epoch": 7.92, + "learning_rate": 1.1533765379919228e-05, + "loss": 0.5603, + "step": 9374, + "task_loss": 1.0063822269439697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5902163982391357, + "epoch": 7.92, + "learning_rate": 1.1529069221376914e-05, + "loss": 0.8235, + "step": 9375, + "task_loss": 1.0874119997024536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6441088914871216, + "epoch": 7.93, + "learning_rate": 1.1524373062834602e-05, + "loss": 0.7089, + "step": 9376, + "task_loss": 1.329241394996643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41410893201828003, + "epoch": 7.93, + "learning_rate": 1.151967690429229e-05, + "loss": 0.7748, + "step": 9377, + "task_loss": 0.3507739305496216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8321547508239746, + "epoch": 7.93, + "learning_rate": 1.1514980745749978e-05, + "loss": 0.6307, + "step": 9378, + "task_loss": 1.0516330003738403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5715767741203308, + "epoch": 7.93, + "learning_rate": 1.1510284587207664e-05, + "loss": 0.6346, + "step": 9379, + "task_loss": 1.2552374601364136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5960497856140137, + "epoch": 7.93, + "learning_rate": 1.1505588428665352e-05, + "loss": 0.562, + "step": 9380, + "task_loss": 1.1063677072525024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5169216394424438, + "epoch": 7.93, + "learning_rate": 1.150089227012304e-05, + "loss": 0.5013, + "step": 9381, + "task_loss": 0.05260728299617767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42139527201652527, + "epoch": 7.93, + "learning_rate": 1.1496196111580727e-05, + "loss": 0.4781, + "step": 9382, + "task_loss": 0.3015449047088623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8397413492202759, + "epoch": 7.93, + "learning_rate": 1.1491499953038415e-05, + "loss": 0.7093, + "step": 9383, + "task_loss": 0.820940375328064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.632329523563385, + "epoch": 7.93, + "learning_rate": 1.1486803794496103e-05, + "loss": 0.602, + "step": 9384, + "task_loss": 0.8939549326896667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.403773695230484, + "epoch": 7.93, + "learning_rate": 1.148210763595379e-05, + "loss": 0.5262, + "step": 9385, + "task_loss": 0.7638610005378723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41257116198539734, + "epoch": 7.93, + "learning_rate": 1.1477411477411477e-05, + "loss": 0.5895, + "step": 9386, + "task_loss": 1.3344825506210327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3829728960990906, + "epoch": 7.93, + "learning_rate": 1.1472715318869165e-05, + "loss": 0.5192, + "step": 9387, + "task_loss": 0.5396255850791931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46532174944877625, + "epoch": 7.94, + "learning_rate": 1.1468019160326853e-05, + "loss": 0.606, + "step": 9388, + "task_loss": 1.5290993452072144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5130600333213806, + "epoch": 7.94, + "learning_rate": 1.1463323001784541e-05, + "loss": 0.4386, + "step": 9389, + "task_loss": 0.2596134841442108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9112262725830078, + "epoch": 7.94, + "learning_rate": 1.145862684324223e-05, + "loss": 0.5762, + "step": 9390, + "task_loss": 0.5521166324615479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5623219609260559, + "epoch": 7.94, + "learning_rate": 1.1453930684699916e-05, + "loss": 0.5641, + "step": 9391, + "task_loss": 1.2574126720428467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3407719135284424, + "epoch": 7.94, + "learning_rate": 1.1449234526157604e-05, + "loss": 0.6748, + "step": 9392, + "task_loss": 0.33723169565200806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5623328685760498, + "epoch": 7.94, + "learning_rate": 1.1444538367615292e-05, + "loss": 0.5412, + "step": 9393, + "task_loss": 0.6533262729644775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3463544249534607, + "epoch": 7.94, + "learning_rate": 1.143984220907298e-05, + "loss": 0.5566, + "step": 9394, + "task_loss": 0.07804758101701736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1490049362182617, + "epoch": 7.94, + "learning_rate": 1.1435146050530666e-05, + "loss": 0.7712, + "step": 9395, + "task_loss": 1.5012118816375732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5321285724639893, + "epoch": 7.94, + "learning_rate": 1.1430449891988354e-05, + "loss": 0.534, + "step": 9396, + "task_loss": 1.1554899215698242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.615803062915802, + "epoch": 7.94, + "learning_rate": 1.1425753733446042e-05, + "loss": 0.5118, + "step": 9397, + "task_loss": 0.6349579691886902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.65102219581604, + "epoch": 7.94, + "learning_rate": 1.1421057574903729e-05, + "loss": 0.7144, + "step": 9398, + "task_loss": 2.310368776321411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6885597705841064, + "epoch": 7.94, + "learning_rate": 1.1416361416361417e-05, + "loss": 0.6439, + "step": 9399, + "task_loss": 0.5299830436706543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6956585645675659, + "epoch": 7.95, + "learning_rate": 1.1411665257819105e-05, + "loss": 0.6378, + "step": 9400, + "task_loss": 1.042489767074585 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.23922953009605408, + "epoch": 7.95, + "learning_rate": 1.1406969099276791e-05, + "loss": 0.4312, + "step": 9401, + "task_loss": 0.23245881497859955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4080907106399536, + "epoch": 7.95, + "learning_rate": 1.1402272940734479e-05, + "loss": 0.6091, + "step": 9402, + "task_loss": 1.4790080785751343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6414295434951782, + "epoch": 7.95, + "learning_rate": 1.1397576782192167e-05, + "loss": 0.7335, + "step": 9403, + "task_loss": 0.5174880623817444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6065413951873779, + "epoch": 7.95, + "learning_rate": 1.1392880623649855e-05, + "loss": 0.5987, + "step": 9404, + "task_loss": 0.7833905816078186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3254406154155731, + "epoch": 7.95, + "learning_rate": 1.1388184465107543e-05, + "loss": 0.6456, + "step": 9405, + "task_loss": 0.5428582429885864 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7399395704269409, + "epoch": 7.95, + "learning_rate": 1.1383488306565231e-05, + "loss": 0.7147, + "step": 9406, + "task_loss": 0.567611813545227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.368856281042099, + "epoch": 7.95, + "learning_rate": 1.1378792148022917e-05, + "loss": 0.6076, + "step": 9407, + "task_loss": 0.36072567105293274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4096352159976959, + "epoch": 7.95, + "learning_rate": 1.1374095989480605e-05, + "loss": 0.5723, + "step": 9408, + "task_loss": 0.3181588649749756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5841010808944702, + "epoch": 7.95, + "learning_rate": 1.1369399830938294e-05, + "loss": 0.5614, + "step": 9409, + "task_loss": 0.4596448838710785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3662697970867157, + "epoch": 7.95, + "learning_rate": 1.1364703672395982e-05, + "loss": 0.5684, + "step": 9410, + "task_loss": 0.47355592250823975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5822714567184448, + "epoch": 7.95, + "learning_rate": 1.1360007513853668e-05, + "loss": 0.48, + "step": 9411, + "task_loss": 0.30053797364234924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5498231053352356, + "epoch": 7.96, + "learning_rate": 1.1355311355311356e-05, + "loss": 0.6215, + "step": 9412, + "task_loss": 0.8459458351135254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2722030282020569, + "epoch": 7.96, + "learning_rate": 1.1350615196769044e-05, + "loss": 0.4228, + "step": 9413, + "task_loss": 0.3971829414367676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6278061270713806, + "epoch": 7.96, + "learning_rate": 1.134591903822673e-05, + "loss": 0.7216, + "step": 9414, + "task_loss": 0.3004898428916931 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4071553647518158, + "epoch": 7.96, + "learning_rate": 1.1341222879684418e-05, + "loss": 0.6367, + "step": 9415, + "task_loss": 1.531945824623108 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48915520310401917, + "epoch": 7.96, + "learning_rate": 1.1336526721142106e-05, + "loss": 0.5535, + "step": 9416, + "task_loss": 0.45885297656059265 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6418141722679138, + "epoch": 7.96, + "learning_rate": 1.1331830562599793e-05, + "loss": 0.5335, + "step": 9417, + "task_loss": 0.2028040885925293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.558367133140564, + "epoch": 7.96, + "learning_rate": 1.132713440405748e-05, + "loss": 0.6851, + "step": 9418, + "task_loss": 0.893747091293335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0176935195922852, + "epoch": 7.96, + "learning_rate": 1.132243824551517e-05, + "loss": 0.7962, + "step": 9419, + "task_loss": 1.1557059288024902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5256329774856567, + "epoch": 7.96, + "learning_rate": 1.1317742086972857e-05, + "loss": 0.5362, + "step": 9420, + "task_loss": 0.5842135548591614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5086922645568848, + "epoch": 7.96, + "learning_rate": 1.1313045928430545e-05, + "loss": 0.5344, + "step": 9421, + "task_loss": 0.28273290395736694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.572037935256958, + "epoch": 7.96, + "learning_rate": 1.1308349769888233e-05, + "loss": 0.8839, + "step": 9422, + "task_loss": 1.2476005554199219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7086217403411865, + "epoch": 7.96, + "learning_rate": 1.130365361134592e-05, + "loss": 0.5545, + "step": 9423, + "task_loss": 1.0698505640029907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6112680435180664, + "epoch": 7.97, + "learning_rate": 1.1298957452803607e-05, + "loss": 0.6292, + "step": 9424, + "task_loss": 0.6469340920448303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0409263372421265, + "epoch": 7.97, + "learning_rate": 1.1294261294261295e-05, + "loss": 0.6465, + "step": 9425, + "task_loss": 1.114302396774292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4348999261856079, + "epoch": 7.97, + "learning_rate": 1.1289565135718982e-05, + "loss": 0.5127, + "step": 9426, + "task_loss": 0.338985413312912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7122441530227661, + "epoch": 7.97, + "learning_rate": 1.128486897717667e-05, + "loss": 0.762, + "step": 9427, + "task_loss": 0.9999758005142212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8094573020935059, + "epoch": 7.97, + "learning_rate": 1.1280172818634358e-05, + "loss": 0.7401, + "step": 9428, + "task_loss": 0.7292618155479431 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4456721544265747, + "epoch": 7.97, + "learning_rate": 1.1275476660092046e-05, + "loss": 0.5867, + "step": 9429, + "task_loss": 0.35794416069984436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7528291940689087, + "epoch": 7.97, + "learning_rate": 1.1270780501549732e-05, + "loss": 0.5665, + "step": 9430, + "task_loss": 1.2289931774139404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44323283433914185, + "epoch": 7.97, + "learning_rate": 1.126608434300742e-05, + "loss": 0.6618, + "step": 9431, + "task_loss": 0.07240094989538193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7988613247871399, + "epoch": 7.97, + "learning_rate": 1.1261388184465108e-05, + "loss": 0.5656, + "step": 9432, + "task_loss": 0.245982825756073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.042212963104248, + "epoch": 7.97, + "learning_rate": 1.1256692025922794e-05, + "loss": 0.8128, + "step": 9433, + "task_loss": 1.2321810722351074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5708528757095337, + "epoch": 7.97, + "learning_rate": 1.1251995867380482e-05, + "loss": 0.5911, + "step": 9434, + "task_loss": 0.7063968181610107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.405477911233902, + "epoch": 7.97, + "learning_rate": 1.1247299708838172e-05, + "loss": 0.7664, + "step": 9435, + "task_loss": 0.0916319265961647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6326366066932678, + "epoch": 7.98, + "learning_rate": 1.1242603550295859e-05, + "loss": 0.8127, + "step": 9436, + "task_loss": 0.8660032153129578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9162658452987671, + "epoch": 7.98, + "learning_rate": 1.1237907391753547e-05, + "loss": 0.7986, + "step": 9437, + "task_loss": 0.9233136773109436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4461521804332733, + "epoch": 7.98, + "learning_rate": 1.1233211233211235e-05, + "loss": 0.4778, + "step": 9438, + "task_loss": 0.6265773773193359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4681202173233032, + "epoch": 7.98, + "learning_rate": 1.1228515074668921e-05, + "loss": 0.5978, + "step": 9439, + "task_loss": 0.5657489895820618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6269662380218506, + "epoch": 7.98, + "learning_rate": 1.1223818916126609e-05, + "loss": 0.5584, + "step": 9440, + "task_loss": 1.0935978889465332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40904706716537476, + "epoch": 7.98, + "learning_rate": 1.1219122757584297e-05, + "loss": 0.4613, + "step": 9441, + "task_loss": 0.4207878112792969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39742425084114075, + "epoch": 7.98, + "learning_rate": 1.1214426599041983e-05, + "loss": 0.6752, + "step": 9442, + "task_loss": 1.166953444480896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6442578434944153, + "epoch": 7.98, + "learning_rate": 1.1209730440499671e-05, + "loss": 0.699, + "step": 9443, + "task_loss": 1.2483375072479248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3225404620170593, + "epoch": 7.98, + "learning_rate": 1.120503428195736e-05, + "loss": 0.4682, + "step": 9444, + "task_loss": 0.07836078107357025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3078013062477112, + "epoch": 7.98, + "learning_rate": 1.1200338123415047e-05, + "loss": 0.57, + "step": 9445, + "task_loss": 0.7894362807273865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7230902910232544, + "epoch": 7.98, + "learning_rate": 1.1195641964872734e-05, + "loss": 0.533, + "step": 9446, + "task_loss": 1.3660478591918945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.709510087966919, + "epoch": 7.99, + "learning_rate": 1.1190945806330422e-05, + "loss": 0.5465, + "step": 9447, + "task_loss": 0.7404565811157227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5552453994750977, + "epoch": 7.99, + "learning_rate": 1.118624964778811e-05, + "loss": 0.7681, + "step": 9448, + "task_loss": 0.6044657230377197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7909207940101624, + "epoch": 7.99, + "learning_rate": 1.1181553489245796e-05, + "loss": 0.7089, + "step": 9449, + "task_loss": 1.0200181007385254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5152877569198608, + "epoch": 7.99, + "learning_rate": 1.1176857330703486e-05, + "loss": 0.5334, + "step": 9450, + "task_loss": 0.10435809940099716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5352990627288818, + "epoch": 7.99, + "learning_rate": 1.1172161172161174e-05, + "loss": 0.5934, + "step": 9451, + "task_loss": 1.231873869895935 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45269647240638733, + "epoch": 7.99, + "learning_rate": 1.116746501361886e-05, + "loss": 0.5244, + "step": 9452, + "task_loss": 0.865516722202301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45614904165267944, + "epoch": 7.99, + "learning_rate": 1.1162768855076548e-05, + "loss": 0.509, + "step": 9453, + "task_loss": 0.7096452713012695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6409111022949219, + "epoch": 7.99, + "learning_rate": 1.1158072696534236e-05, + "loss": 0.5441, + "step": 9454, + "task_loss": 0.9649538993835449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48468875885009766, + "epoch": 7.99, + "learning_rate": 1.1153376537991923e-05, + "loss": 0.5363, + "step": 9455, + "task_loss": 1.204743504524231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.526914119720459, + "epoch": 7.99, + "learning_rate": 1.114868037944961e-05, + "loss": 0.6603, + "step": 9456, + "task_loss": 0.28179535269737244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5733225345611572, + "epoch": 7.99, + "learning_rate": 1.1143984220907299e-05, + "loss": 0.525, + "step": 9457, + "task_loss": 0.6432082056999207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5511621236801147, + "epoch": 7.99, + "learning_rate": 1.1139288062364985e-05, + "loss": 0.6907, + "step": 9458, + "task_loss": 0.36878806352615356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7930650115013123, + "epoch": 8.0, + "learning_rate": 1.1134591903822673e-05, + "loss": 0.6613, + "step": 9459, + "task_loss": 0.45087382197380066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8984353542327881, + "epoch": 8.0, + "learning_rate": 1.1129895745280361e-05, + "loss": 0.7457, + "step": 9460, + "task_loss": 1.0850975513458252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9980065226554871, + "epoch": 8.0, + "learning_rate": 1.112519958673805e-05, + "loss": 0.5874, + "step": 9461, + "task_loss": 1.310005784034729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7816933393478394, + "epoch": 8.0, + "learning_rate": 1.1120503428195736e-05, + "loss": 0.6643, + "step": 9462, + "task_loss": 1.144230842590332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9402523636817932, + "epoch": 8.0, + "learning_rate": 1.1115807269653424e-05, + "loss": 0.7737, + "step": 9463, + "task_loss": 1.4603557586669922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8404320478439331, + "epoch": 8.0, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.5656, + "step": 9464, + "task_loss": 0.6050480604171753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5812202095985413, + "epoch": 8.0, + "learning_rate": 1.11064149525688e-05, + "loss": 1.291, + "step": 9465, + "task_loss": 0.38076305389404297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41187962889671326, + "epoch": 8.0, + "learning_rate": 1.1101718794026488e-05, + "loss": 0.4509, + "step": 9466, + "task_loss": 0.3408677279949188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4635920524597168, + "epoch": 8.0, + "learning_rate": 1.1097022635484176e-05, + "loss": 0.7054, + "step": 9467, + "task_loss": 1.1829211711883545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1287435293197632, + "epoch": 8.0, + "learning_rate": 1.1092326476941862e-05, + "loss": 0.7307, + "step": 9468, + "task_loss": 1.8518165349960327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27776604890823364, + "epoch": 8.0, + "learning_rate": 1.108763031839955e-05, + "loss": 0.5168, + "step": 9469, + "task_loss": 0.12676112353801727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.407720685005188, + "epoch": 8.01, + "learning_rate": 1.1082934159857238e-05, + "loss": 0.5175, + "step": 9470, + "task_loss": 0.7011333703994751 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4935466945171356, + "epoch": 8.01, + "learning_rate": 1.1078238001314924e-05, + "loss": 0.5673, + "step": 9471, + "task_loss": 0.504375159740448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7150142192840576, + "epoch": 8.01, + "learning_rate": 1.1073541842772613e-05, + "loss": 0.8321, + "step": 9472, + "task_loss": 0.6426998972892761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5436955690383911, + "epoch": 8.01, + "learning_rate": 1.10688456842303e-05, + "loss": 0.5863, + "step": 9473, + "task_loss": 1.0703023672103882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5467393398284912, + "epoch": 8.01, + "learning_rate": 1.1064149525687987e-05, + "loss": 0.8791, + "step": 9474, + "task_loss": 0.7123474478721619 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.562700092792511, + "epoch": 8.01, + "learning_rate": 1.1059453367145675e-05, + "loss": 0.7148, + "step": 9475, + "task_loss": 1.1184792518615723 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5672982335090637, + "epoch": 8.01, + "learning_rate": 1.1054757208603363e-05, + "loss": 0.6271, + "step": 9476, + "task_loss": 1.3319036960601807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49629098176956177, + "epoch": 8.01, + "learning_rate": 1.1050061050061051e-05, + "loss": 0.6934, + "step": 9477, + "task_loss": 0.33800455927848816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9714409112930298, + "epoch": 8.01, + "learning_rate": 1.1045364891518737e-05, + "loss": 0.6578, + "step": 9478, + "task_loss": 0.5665557384490967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7250206470489502, + "epoch": 8.01, + "learning_rate": 1.1040668732976425e-05, + "loss": 0.6071, + "step": 9479, + "task_loss": 0.6859468221664429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7587070465087891, + "epoch": 8.01, + "learning_rate": 1.1035972574434113e-05, + "loss": 0.7226, + "step": 9480, + "task_loss": 0.8831332325935364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7198452353477478, + "epoch": 8.01, + "learning_rate": 1.1031276415891801e-05, + "loss": 0.556, + "step": 9481, + "task_loss": 0.7043091058731079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4940698444843292, + "epoch": 8.02, + "learning_rate": 1.102658025734949e-05, + "loss": 0.7, + "step": 9482, + "task_loss": 0.7922248840332031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7235041260719299, + "epoch": 8.02, + "learning_rate": 1.1021884098807178e-05, + "loss": 0.6851, + "step": 9483, + "task_loss": 0.2755070924758911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3108081817626953, + "epoch": 8.02, + "learning_rate": 1.1017187940264864e-05, + "loss": 0.3959, + "step": 9484, + "task_loss": 0.32028353214263916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.704292893409729, + "epoch": 8.02, + "learning_rate": 1.1012491781722552e-05, + "loss": 0.5865, + "step": 9485, + "task_loss": 0.3405129015445709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7406017184257507, + "epoch": 8.02, + "learning_rate": 1.100779562318024e-05, + "loss": 0.606, + "step": 9486, + "task_loss": 1.3421653509140015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47849640250205994, + "epoch": 8.02, + "learning_rate": 1.1003099464637926e-05, + "loss": 0.6357, + "step": 9487, + "task_loss": 0.3793576955795288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5242279767990112, + "epoch": 8.02, + "learning_rate": 1.0998403306095614e-05, + "loss": 0.5402, + "step": 9488, + "task_loss": 0.5913048386573792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8360837697982788, + "epoch": 8.02, + "learning_rate": 1.0993707147553302e-05, + "loss": 0.5956, + "step": 9489, + "task_loss": 1.5875080823898315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6151258945465088, + "epoch": 8.02, + "learning_rate": 1.0989010989010989e-05, + "loss": 0.666, + "step": 9490, + "task_loss": 0.9413996338844299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.795143723487854, + "epoch": 8.02, + "learning_rate": 1.0984314830468677e-05, + "loss": 0.6743, + "step": 9491, + "task_loss": 2.242873430252075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0498865842819214, + "epoch": 8.02, + "learning_rate": 1.0979618671926365e-05, + "loss": 0.676, + "step": 9492, + "task_loss": 1.261332631111145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5436716675758362, + "epoch": 8.02, + "learning_rate": 1.0974922513384053e-05, + "loss": 0.7626, + "step": 9493, + "task_loss": 0.6913396120071411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5174266695976257, + "epoch": 8.03, + "learning_rate": 1.0970226354841739e-05, + "loss": 0.5278, + "step": 9494, + "task_loss": 0.11990638077259064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6571503281593323, + "epoch": 8.03, + "learning_rate": 1.0965530196299427e-05, + "loss": 0.6977, + "step": 9495, + "task_loss": 0.8220389485359192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6781460642814636, + "epoch": 8.03, + "learning_rate": 1.0960834037757115e-05, + "loss": 0.6446, + "step": 9496, + "task_loss": 0.45212236046791077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46204933524131775, + "epoch": 8.03, + "learning_rate": 1.0956137879214803e-05, + "loss": 0.5897, + "step": 9497, + "task_loss": 0.5921857953071594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6566625237464905, + "epoch": 8.03, + "learning_rate": 1.0951441720672491e-05, + "loss": 0.5933, + "step": 9498, + "task_loss": 0.6926958560943604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6501269340515137, + "epoch": 8.03, + "learning_rate": 1.094674556213018e-05, + "loss": 0.4954, + "step": 9499, + "task_loss": 0.42471814155578613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5690914392471313, + "epoch": 8.03, + "learning_rate": 1.0942049403587866e-05, + "loss": 0.6174, + "step": 9500, + "task_loss": 0.7666858434677124 + }, + { + "epoch": 8.03, + "eval_accuracy": 0.9005148514851485, + "eval_loss": 0.39518916606903076, + "eval_runtime": 225.8436, + "eval_samples_per_second": 111.803, + "eval_steps_per_second": 0.877, + "step": 9500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7263021469116211, + "epoch": 8.03, + "learning_rate": 1.0937353245045554e-05, + "loss": 0.5625, + "step": 9501, + "task_loss": 0.7129467725753784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5475554466247559, + "epoch": 8.03, + "learning_rate": 1.0932657086503242e-05, + "loss": 0.7051, + "step": 9502, + "task_loss": 1.0951508283615112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4801509380340576, + "epoch": 8.03, + "learning_rate": 1.0927960927960928e-05, + "loss": 0.5024, + "step": 9503, + "task_loss": 0.6476265788078308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.993637204170227, + "epoch": 8.03, + "learning_rate": 1.0923264769418616e-05, + "loss": 0.6417, + "step": 9504, + "task_loss": 1.204645037651062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6532274484634399, + "epoch": 8.03, + "learning_rate": 1.0918568610876304e-05, + "loss": 0.6253, + "step": 9505, + "task_loss": 0.7137248516082764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5510648488998413, + "epoch": 8.04, + "learning_rate": 1.091387245233399e-05, + "loss": 0.6376, + "step": 9506, + "task_loss": 0.5058783292770386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5566229820251465, + "epoch": 8.04, + "learning_rate": 1.0909176293791678e-05, + "loss": 0.6422, + "step": 9507, + "task_loss": 0.7207304835319519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.129194974899292, + "epoch": 8.04, + "learning_rate": 1.0904480135249366e-05, + "loss": 0.8346, + "step": 9508, + "task_loss": 1.364959716796875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46848469972610474, + "epoch": 8.04, + "learning_rate": 1.0899783976707053e-05, + "loss": 0.5779, + "step": 9509, + "task_loss": 0.2549867630004883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8912129402160645, + "epoch": 8.04, + "learning_rate": 1.0895087818164741e-05, + "loss": 0.712, + "step": 9510, + "task_loss": 1.5058979988098145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3895356059074402, + "epoch": 8.04, + "learning_rate": 1.0890391659622429e-05, + "loss": 0.5678, + "step": 9511, + "task_loss": 0.8778238296508789 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3521405756473541, + "epoch": 8.04, + "learning_rate": 1.0885695501080117e-05, + "loss": 0.445, + "step": 9512, + "task_loss": 0.5191879272460938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7175852060317993, + "epoch": 8.04, + "learning_rate": 1.0880999342537805e-05, + "loss": 0.5306, + "step": 9513, + "task_loss": 1.2599470615386963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40497875213623047, + "epoch": 8.04, + "learning_rate": 1.0876303183995493e-05, + "loss": 0.5988, + "step": 9514, + "task_loss": 0.7481816411018372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7450413703918457, + "epoch": 8.04, + "learning_rate": 1.087160702545318e-05, + "loss": 0.7329, + "step": 9515, + "task_loss": 1.0181057453155518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41929566860198975, + "epoch": 8.04, + "learning_rate": 1.0866910866910867e-05, + "loss": 0.4814, + "step": 9516, + "task_loss": 0.6816752552986145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3803497552871704, + "epoch": 8.04, + "learning_rate": 1.0862214708368555e-05, + "loss": 0.5811, + "step": 9517, + "task_loss": 0.42157575488090515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7278377413749695, + "epoch": 8.05, + "learning_rate": 1.0857518549826243e-05, + "loss": 0.6044, + "step": 9518, + "task_loss": 1.1790415048599243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44540244340896606, + "epoch": 8.05, + "learning_rate": 1.085282239128393e-05, + "loss": 0.5197, + "step": 9519, + "task_loss": 0.8392729759216309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7218919992446899, + "epoch": 8.05, + "learning_rate": 1.0848126232741618e-05, + "loss": 0.558, + "step": 9520, + "task_loss": 0.27950525283813477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5292428731918335, + "epoch": 8.05, + "learning_rate": 1.0843430074199306e-05, + "loss": 0.6059, + "step": 9521, + "task_loss": 0.2518424987792969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6718384623527527, + "epoch": 8.05, + "learning_rate": 1.0838733915656992e-05, + "loss": 0.8361, + "step": 9522, + "task_loss": 0.615452229976654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4190441370010376, + "epoch": 8.05, + "learning_rate": 1.083403775711468e-05, + "loss": 0.6571, + "step": 9523, + "task_loss": 0.35550805926322937 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4793803095817566, + "epoch": 8.05, + "learning_rate": 1.0829341598572368e-05, + "loss": 0.7197, + "step": 9524, + "task_loss": 0.4837338924407959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4783594012260437, + "epoch": 8.05, + "learning_rate": 1.0824645440030055e-05, + "loss": 0.5728, + "step": 9525, + "task_loss": 0.5893384218215942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7351547479629517, + "epoch": 8.05, + "learning_rate": 1.0819949281487743e-05, + "loss": 0.6201, + "step": 9526, + "task_loss": 0.3935568034648895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7646112442016602, + "epoch": 8.05, + "learning_rate": 1.0815253122945432e-05, + "loss": 0.5183, + "step": 9527, + "task_loss": 1.1995060443878174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5851702690124512, + "epoch": 8.05, + "learning_rate": 1.0810556964403119e-05, + "loss": 0.4713, + "step": 9528, + "task_loss": 0.7927724719047546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7064776420593262, + "epoch": 8.05, + "learning_rate": 1.0805860805860807e-05, + "loss": 0.6468, + "step": 9529, + "task_loss": 1.4219470024108887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7205072641372681, + "epoch": 8.06, + "learning_rate": 1.0801164647318495e-05, + "loss": 0.6581, + "step": 9530, + "task_loss": 0.4179346561431885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9075226783752441, + "epoch": 8.06, + "learning_rate": 1.0796468488776181e-05, + "loss": 0.7322, + "step": 9531, + "task_loss": 0.47756433486938477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1874046325683594, + "epoch": 8.06, + "learning_rate": 1.0791772330233869e-05, + "loss": 0.5494, + "step": 9532, + "task_loss": 0.40479543805122375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.669325053691864, + "epoch": 8.06, + "learning_rate": 1.0787076171691557e-05, + "loss": 0.6081, + "step": 9533, + "task_loss": 0.966395378112793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4969741702079773, + "epoch": 8.06, + "learning_rate": 1.0782380013149245e-05, + "loss": 0.6979, + "step": 9534, + "task_loss": 1.2446085214614868 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4613681435585022, + "epoch": 8.06, + "learning_rate": 1.0777683854606932e-05, + "loss": 0.4495, + "step": 9535, + "task_loss": 0.2641541659832001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32833755016326904, + "epoch": 8.06, + "learning_rate": 1.077298769606462e-05, + "loss": 0.5262, + "step": 9536, + "task_loss": 0.18245911598205566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4848249554634094, + "epoch": 8.06, + "learning_rate": 1.0768291537522308e-05, + "loss": 0.3762, + "step": 9537, + "task_loss": 0.5912644863128662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7036850452423096, + "epoch": 8.06, + "learning_rate": 1.0763595378979994e-05, + "loss": 0.4789, + "step": 9538, + "task_loss": 1.3571386337280273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.843467652797699, + "epoch": 8.06, + "learning_rate": 1.0758899220437682e-05, + "loss": 0.5502, + "step": 9539, + "task_loss": 1.5222222805023193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42739373445510864, + "epoch": 8.06, + "learning_rate": 1.075420306189537e-05, + "loss": 0.6654, + "step": 9540, + "task_loss": 0.135125070810318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49046146869659424, + "epoch": 8.07, + "learning_rate": 1.0749506903353056e-05, + "loss": 0.6236, + "step": 9541, + "task_loss": 0.7644829750061035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3661442697048187, + "epoch": 8.07, + "learning_rate": 1.0744810744810746e-05, + "loss": 0.6882, + "step": 9542, + "task_loss": 0.9548665881156921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47083649039268494, + "epoch": 8.07, + "learning_rate": 1.0740114586268434e-05, + "loss": 0.6607, + "step": 9543, + "task_loss": 0.3314148783683777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5044400095939636, + "epoch": 8.07, + "learning_rate": 1.073541842772612e-05, + "loss": 0.62, + "step": 9544, + "task_loss": 0.40416666865348816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5955731868743896, + "epoch": 8.07, + "learning_rate": 1.0730722269183808e-05, + "loss": 0.5837, + "step": 9545, + "task_loss": 0.4850604236125946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9529322385787964, + "epoch": 8.07, + "learning_rate": 1.0726026110641497e-05, + "loss": 0.6436, + "step": 9546, + "task_loss": 1.1506404876708984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6712486743927002, + "epoch": 8.07, + "learning_rate": 1.0721329952099183e-05, + "loss": 0.792, + "step": 9547, + "task_loss": 0.5607008337974548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3535802960395813, + "epoch": 8.07, + "learning_rate": 1.0716633793556871e-05, + "loss": 0.5941, + "step": 9548, + "task_loss": 0.30362826585769653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.633469820022583, + "epoch": 8.07, + "learning_rate": 1.0711937635014559e-05, + "loss": 0.649, + "step": 9549, + "task_loss": 0.999631941318512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4113759398460388, + "epoch": 8.07, + "learning_rate": 1.0707241476472247e-05, + "loss": 0.5333, + "step": 9550, + "task_loss": 0.04083705693483353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7012148499488831, + "epoch": 8.07, + "learning_rate": 1.0702545317929933e-05, + "loss": 0.5505, + "step": 9551, + "task_loss": 0.48087257146835327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6547578573226929, + "epoch": 8.07, + "learning_rate": 1.0697849159387621e-05, + "loss": 0.4577, + "step": 9552, + "task_loss": 0.4722820520401001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5986226201057434, + "epoch": 8.08, + "learning_rate": 1.069315300084531e-05, + "loss": 0.5541, + "step": 9553, + "task_loss": 0.39321789145469666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.633654773235321, + "epoch": 8.08, + "learning_rate": 1.0688456842302996e-05, + "loss": 0.6088, + "step": 9554, + "task_loss": 0.75005042552948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1990816593170166, + "epoch": 8.08, + "learning_rate": 1.0683760683760684e-05, + "loss": 0.7673, + "step": 9555, + "task_loss": 1.1684298515319824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.699962854385376, + "epoch": 8.08, + "learning_rate": 1.0679064525218372e-05, + "loss": 0.5458, + "step": 9556, + "task_loss": 0.5113999247550964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43545788526535034, + "epoch": 8.08, + "learning_rate": 1.0674368366676058e-05, + "loss": 0.522, + "step": 9557, + "task_loss": 0.41227102279663086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37495675683021545, + "epoch": 8.08, + "learning_rate": 1.0669672208133748e-05, + "loss": 0.4859, + "step": 9558, + "task_loss": 0.13670139014720917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5536849498748779, + "epoch": 8.08, + "learning_rate": 1.0664976049591436e-05, + "loss": 0.6112, + "step": 9559, + "task_loss": 0.5380703210830688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7537777423858643, + "epoch": 8.08, + "learning_rate": 1.0660279891049122e-05, + "loss": 0.6979, + "step": 9560, + "task_loss": 1.063680648803711 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6131018400192261, + "epoch": 8.08, + "learning_rate": 1.065558373250681e-05, + "loss": 0.5089, + "step": 9561, + "task_loss": 0.9307178854942322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5359773635864258, + "epoch": 8.08, + "learning_rate": 1.0650887573964498e-05, + "loss": 0.7115, + "step": 9562, + "task_loss": 0.4723987579345703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42610296607017517, + "epoch": 8.08, + "learning_rate": 1.0646191415422185e-05, + "loss": 0.7135, + "step": 9563, + "task_loss": 1.3517779111862183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5900095701217651, + "epoch": 8.08, + "learning_rate": 1.0641495256879873e-05, + "loss": 0.6825, + "step": 9564, + "task_loss": 0.8574258089065552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6208919286727905, + "epoch": 8.09, + "learning_rate": 1.063679909833756e-05, + "loss": 0.662, + "step": 9565, + "task_loss": 0.9144099950790405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5470881462097168, + "epoch": 8.09, + "learning_rate": 1.0632102939795249e-05, + "loss": 0.464, + "step": 9566, + "task_loss": 0.2717429995536804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7977765798568726, + "epoch": 8.09, + "learning_rate": 1.0627406781252935e-05, + "loss": 0.697, + "step": 9567, + "task_loss": 1.1663471460342407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6453373432159424, + "epoch": 8.09, + "learning_rate": 1.0622710622710623e-05, + "loss": 0.4949, + "step": 9568, + "task_loss": 1.1563379764556885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.917698323726654, + "epoch": 8.09, + "learning_rate": 1.0618014464168311e-05, + "loss": 0.871, + "step": 9569, + "task_loss": 1.695438265800476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4665974974632263, + "epoch": 8.09, + "learning_rate": 1.0613318305625997e-05, + "loss": 0.6905, + "step": 9570, + "task_loss": 1.4889096021652222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.812059223651886, + "epoch": 8.09, + "learning_rate": 1.0608622147083686e-05, + "loss": 0.5566, + "step": 9571, + "task_loss": 1.3008863925933838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5726824998855591, + "epoch": 8.09, + "learning_rate": 1.0603925988541374e-05, + "loss": 0.5527, + "step": 9572, + "task_loss": 0.5701005458831787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7214691638946533, + "epoch": 8.09, + "learning_rate": 1.0599229829999062e-05, + "loss": 0.4372, + "step": 9573, + "task_loss": 1.3378514051437378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36166322231292725, + "epoch": 8.09, + "learning_rate": 1.059453367145675e-05, + "loss": 0.6361, + "step": 9574, + "task_loss": 0.2553742825984955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2937757968902588, + "epoch": 8.09, + "learning_rate": 1.0589837512914438e-05, + "loss": 0.779, + "step": 9575, + "task_loss": 1.1187267303466797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5925964713096619, + "epoch": 8.09, + "learning_rate": 1.0585141354372124e-05, + "loss": 0.6434, + "step": 9576, + "task_loss": 0.45923542976379395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44675207138061523, + "epoch": 8.1, + "learning_rate": 1.0580445195829812e-05, + "loss": 0.4873, + "step": 9577, + "task_loss": 0.7855995297431946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5561177730560303, + "epoch": 8.1, + "learning_rate": 1.05757490372875e-05, + "loss": 0.5384, + "step": 9578, + "task_loss": 0.683715283870697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8120338320732117, + "epoch": 8.1, + "learning_rate": 1.0571052878745186e-05, + "loss": 0.8039, + "step": 9579, + "task_loss": 0.8938063979148865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7195050716400146, + "epoch": 8.1, + "learning_rate": 1.0566356720202874e-05, + "loss": 0.515, + "step": 9580, + "task_loss": 0.35608747601509094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.733194887638092, + "epoch": 8.1, + "learning_rate": 1.0561660561660562e-05, + "loss": 0.5969, + "step": 9581, + "task_loss": 1.2638418674468994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3394720256328583, + "epoch": 8.1, + "learning_rate": 1.0556964403118249e-05, + "loss": 0.4765, + "step": 9582, + "task_loss": 0.8286453485488892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4973260760307312, + "epoch": 8.1, + "learning_rate": 1.0552268244575937e-05, + "loss": 0.5202, + "step": 9583, + "task_loss": 0.8858888745307922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7265393733978271, + "epoch": 8.1, + "learning_rate": 1.0547572086033625e-05, + "loss": 0.7132, + "step": 9584, + "task_loss": 0.34723055362701416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.23681239783763885, + "epoch": 8.1, + "learning_rate": 1.0542875927491313e-05, + "loss": 0.5906, + "step": 9585, + "task_loss": 0.4672214984893799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5347276926040649, + "epoch": 8.1, + "learning_rate": 1.0538179768949e-05, + "loss": 0.6742, + "step": 9586, + "task_loss": 1.4647562503814697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7493748664855957, + "epoch": 8.1, + "learning_rate": 1.0533483610406687e-05, + "loss": 0.542, + "step": 9587, + "task_loss": 0.8830563426017761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5776088237762451, + "epoch": 8.1, + "learning_rate": 1.0528787451864375e-05, + "loss": 0.7206, + "step": 9588, + "task_loss": 1.0398075580596924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6529967784881592, + "epoch": 8.11, + "learning_rate": 1.0524091293322063e-05, + "loss": 0.7341, + "step": 9589, + "task_loss": 0.15972480177879333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6593091487884521, + "epoch": 8.11, + "learning_rate": 1.0519395134779751e-05, + "loss": 0.6464, + "step": 9590, + "task_loss": 0.7615033388137817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4179069995880127, + "epoch": 8.11, + "learning_rate": 1.051469897623744e-05, + "loss": 0.5787, + "step": 9591, + "task_loss": 0.5175882577896118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6920779943466187, + "epoch": 8.11, + "learning_rate": 1.0510002817695126e-05, + "loss": 0.6553, + "step": 9592, + "task_loss": 1.5671223402023315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5582391023635864, + "epoch": 8.11, + "learning_rate": 1.0505306659152814e-05, + "loss": 0.5605, + "step": 9593, + "task_loss": 0.9209933280944824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3108524978160858, + "epoch": 8.11, + "learning_rate": 1.0500610500610502e-05, + "loss": 0.5813, + "step": 9594, + "task_loss": 0.39493125677108765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7317145466804504, + "epoch": 8.11, + "learning_rate": 1.0495914342068188e-05, + "loss": 0.6907, + "step": 9595, + "task_loss": 0.8386626839637756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6233494281768799, + "epoch": 8.11, + "learning_rate": 1.0491218183525876e-05, + "loss": 0.589, + "step": 9596, + "task_loss": 0.3758171498775482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.431942880153656, + "epoch": 8.11, + "learning_rate": 1.0486522024983564e-05, + "loss": 0.4371, + "step": 9597, + "task_loss": 0.44773709774017334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7049449682235718, + "epoch": 8.11, + "learning_rate": 1.048182586644125e-05, + "loss": 0.5808, + "step": 9598, + "task_loss": 0.33941659331321716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.695440948009491, + "epoch": 8.11, + "learning_rate": 1.0477129707898939e-05, + "loss": 0.5957, + "step": 9599, + "task_loss": 0.2897320091724396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4686157703399658, + "epoch": 8.11, + "learning_rate": 1.0472433549356627e-05, + "loss": 0.5629, + "step": 9600, + "task_loss": 0.8260859251022339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5623164176940918, + "epoch": 8.12, + "learning_rate": 1.0467737390814315e-05, + "loss": 0.6657, + "step": 9601, + "task_loss": 1.1238728761672974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47302550077438354, + "epoch": 8.12, + "learning_rate": 1.0463041232272001e-05, + "loss": 0.4944, + "step": 9602, + "task_loss": 0.8181694149971008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8031943440437317, + "epoch": 8.12, + "learning_rate": 1.0458345073729689e-05, + "loss": 0.9323, + "step": 9603, + "task_loss": 1.8125805854797363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5306625962257385, + "epoch": 8.12, + "learning_rate": 1.0453648915187377e-05, + "loss": 0.4966, + "step": 9604, + "task_loss": 0.25914791226387024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8246955871582031, + "epoch": 8.12, + "learning_rate": 1.0448952756645065e-05, + "loss": 0.5744, + "step": 9605, + "task_loss": 0.9749420285224915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8620201945304871, + "epoch": 8.12, + "learning_rate": 1.0444256598102753e-05, + "loss": 0.6665, + "step": 9606, + "task_loss": 0.5471687316894531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44490599632263184, + "epoch": 8.12, + "learning_rate": 1.0439560439560441e-05, + "loss": 0.5803, + "step": 9607, + "task_loss": 0.4878123998641968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.733820915222168, + "epoch": 8.12, + "learning_rate": 1.0434864281018128e-05, + "loss": 0.6365, + "step": 9608, + "task_loss": 0.6441925764083862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36357760429382324, + "epoch": 8.12, + "learning_rate": 1.0430168122475816e-05, + "loss": 0.5572, + "step": 9609, + "task_loss": 0.15739966928958893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5703957080841064, + "epoch": 8.12, + "learning_rate": 1.0425471963933504e-05, + "loss": 0.4975, + "step": 9610, + "task_loss": 1.3545721769332886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7515392303466797, + "epoch": 8.12, + "learning_rate": 1.042077580539119e-05, + "loss": 0.7212, + "step": 9611, + "task_loss": 0.6423227190971375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42978012561798096, + "epoch": 8.13, + "learning_rate": 1.0416079646848878e-05, + "loss": 0.5565, + "step": 9612, + "task_loss": 0.265924870967865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4373977780342102, + "epoch": 8.13, + "learning_rate": 1.0411383488306566e-05, + "loss": 0.6478, + "step": 9613, + "task_loss": 0.6766417622566223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5850145220756531, + "epoch": 8.13, + "learning_rate": 1.0406687329764252e-05, + "loss": 0.5635, + "step": 9614, + "task_loss": 0.09358639270067215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0222697257995605, + "epoch": 8.13, + "learning_rate": 1.040199117122194e-05, + "loss": 0.6646, + "step": 9615, + "task_loss": 0.7159797549247742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4077756404876709, + "epoch": 8.13, + "learning_rate": 1.0397295012679628e-05, + "loss": 0.4371, + "step": 9616, + "task_loss": 0.8624580502510071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7308584451675415, + "epoch": 8.13, + "learning_rate": 1.0392598854137316e-05, + "loss": 0.7253, + "step": 9617, + "task_loss": 0.6728371977806091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4798913598060608, + "epoch": 8.13, + "learning_rate": 1.0387902695595003e-05, + "loss": 0.5082, + "step": 9618, + "task_loss": 0.18188521265983582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7289467453956604, + "epoch": 8.13, + "learning_rate": 1.0383206537052693e-05, + "loss": 0.7303, + "step": 9619, + "task_loss": 1.0255823135375977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.515568733215332, + "epoch": 8.13, + "learning_rate": 1.0378510378510379e-05, + "loss": 0.5055, + "step": 9620, + "task_loss": 0.9581947326660156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6622647643089294, + "epoch": 8.13, + "learning_rate": 1.0373814219968067e-05, + "loss": 0.6572, + "step": 9621, + "task_loss": 1.3260321617126465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6470175385475159, + "epoch": 8.13, + "learning_rate": 1.0369118061425755e-05, + "loss": 0.6214, + "step": 9622, + "task_loss": 0.7599854469299316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4801376461982727, + "epoch": 8.13, + "learning_rate": 1.0364421902883443e-05, + "loss": 0.5218, + "step": 9623, + "task_loss": 0.4328024089336395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8643584251403809, + "epoch": 8.14, + "learning_rate": 1.035972574434113e-05, + "loss": 0.5556, + "step": 9624, + "task_loss": 1.362401008605957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5558688044548035, + "epoch": 8.14, + "learning_rate": 1.0355029585798817e-05, + "loss": 0.6025, + "step": 9625, + "task_loss": 0.22913341224193573 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4091380834579468, + "epoch": 8.14, + "learning_rate": 1.0350333427256505e-05, + "loss": 0.4728, + "step": 9626, + "task_loss": 0.18021298944950104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43584200739860535, + "epoch": 8.14, + "learning_rate": 1.0345637268714192e-05, + "loss": 0.6078, + "step": 9627, + "task_loss": 0.9064238667488098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.1762903332710266, + "epoch": 8.14, + "learning_rate": 1.034094111017188e-05, + "loss": 0.5923, + "step": 9628, + "task_loss": 0.01924740895628929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7627092599868774, + "epoch": 8.14, + "learning_rate": 1.0336244951629568e-05, + "loss": 0.715, + "step": 9629, + "task_loss": 1.085907220840454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47081637382507324, + "epoch": 8.14, + "learning_rate": 1.0331548793087254e-05, + "loss": 0.5698, + "step": 9630, + "task_loss": 0.5714925527572632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2772781252861023, + "epoch": 8.14, + "learning_rate": 1.0326852634544942e-05, + "loss": 0.383, + "step": 9631, + "task_loss": 0.6393018960952759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6902523636817932, + "epoch": 8.14, + "learning_rate": 1.032215647600263e-05, + "loss": 0.5624, + "step": 9632, + "task_loss": 0.3776966333389282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.20791667699813843, + "epoch": 8.14, + "learning_rate": 1.0317460317460318e-05, + "loss": 0.48, + "step": 9633, + "task_loss": 0.14684775471687317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7786867618560791, + "epoch": 8.14, + "learning_rate": 1.0312764158918005e-05, + "loss": 0.7661, + "step": 9634, + "task_loss": 1.068207025527954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5359975099563599, + "epoch": 8.14, + "learning_rate": 1.0308068000375694e-05, + "loss": 0.535, + "step": 9635, + "task_loss": 0.7561880350112915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.730118989944458, + "epoch": 8.15, + "learning_rate": 1.030337184183338e-05, + "loss": 0.7372, + "step": 9636, + "task_loss": 0.40356141328811646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.835820198059082, + "epoch": 8.15, + "learning_rate": 1.0298675683291069e-05, + "loss": 0.6542, + "step": 9637, + "task_loss": 1.2195770740509033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6262813806533813, + "epoch": 8.15, + "learning_rate": 1.0293979524748757e-05, + "loss": 0.5806, + "step": 9638, + "task_loss": 0.836056649684906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4134715795516968, + "epoch": 8.15, + "learning_rate": 1.0289283366206445e-05, + "loss": 0.4763, + "step": 9639, + "task_loss": 1.5802605152130127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.556692361831665, + "epoch": 8.15, + "learning_rate": 1.0284587207664131e-05, + "loss": 0.7114, + "step": 9640, + "task_loss": 0.5530619025230408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5864953994750977, + "epoch": 8.15, + "learning_rate": 1.0279891049121819e-05, + "loss": 0.6382, + "step": 9641, + "task_loss": 0.8927907347679138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5987105369567871, + "epoch": 8.15, + "learning_rate": 1.0275194890579507e-05, + "loss": 0.617, + "step": 9642, + "task_loss": 0.6005358695983887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39652904868125916, + "epoch": 8.15, + "learning_rate": 1.0270498732037193e-05, + "loss": 0.4898, + "step": 9643, + "task_loss": 1.6250534057617188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7027376890182495, + "epoch": 8.15, + "learning_rate": 1.0265802573494881e-05, + "loss": 0.4717, + "step": 9644, + "task_loss": 0.7919668555259705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5486020445823669, + "epoch": 8.15, + "learning_rate": 1.026110641495257e-05, + "loss": 0.6, + "step": 9645, + "task_loss": 0.6079891324043274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4156404137611389, + "epoch": 8.15, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.5382, + "step": 9646, + "task_loss": 0.1482173204421997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35263124108314514, + "epoch": 8.15, + "learning_rate": 1.0251714097867944e-05, + "loss": 0.6978, + "step": 9647, + "task_loss": 0.5375698208808899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5329613089561462, + "epoch": 8.16, + "learning_rate": 1.0247017939325632e-05, + "loss": 0.5681, + "step": 9648, + "task_loss": 1.024849772453308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6611464023590088, + "epoch": 8.16, + "learning_rate": 1.0242321780783318e-05, + "loss": 0.6055, + "step": 9649, + "task_loss": 0.10155653953552246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6950768232345581, + "epoch": 8.16, + "learning_rate": 1.0237625622241008e-05, + "loss": 0.5142, + "step": 9650, + "task_loss": 0.3393682539463043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32113179564476013, + "epoch": 8.16, + "learning_rate": 1.0232929463698696e-05, + "loss": 0.5137, + "step": 9651, + "task_loss": 1.0396702289581299 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7534380555152893, + "epoch": 8.16, + "learning_rate": 1.0228233305156382e-05, + "loss": 0.5778, + "step": 9652, + "task_loss": 0.4444790780544281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8492867946624756, + "epoch": 8.16, + "learning_rate": 1.022353714661407e-05, + "loss": 0.6517, + "step": 9653, + "task_loss": 0.8703060150146484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7189669609069824, + "epoch": 8.16, + "learning_rate": 1.0218840988071758e-05, + "loss": 0.667, + "step": 9654, + "task_loss": 1.319606900215149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6175704002380371, + "epoch": 8.16, + "learning_rate": 1.0214144829529446e-05, + "loss": 0.5485, + "step": 9655, + "task_loss": 0.6523827314376831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2409095764160156, + "epoch": 8.16, + "learning_rate": 1.0209448670987133e-05, + "loss": 0.7424, + "step": 9656, + "task_loss": 0.9884382486343384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3530313968658447, + "epoch": 8.16, + "learning_rate": 1.020475251244482e-05, + "loss": 0.6038, + "step": 9657, + "task_loss": 0.48726996779441833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3822699189186096, + "epoch": 8.16, + "learning_rate": 1.0200056353902509e-05, + "loss": 0.6504, + "step": 9658, + "task_loss": 0.11936576664447784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5424160957336426, + "epoch": 8.16, + "learning_rate": 1.0195360195360195e-05, + "loss": 0.6503, + "step": 9659, + "task_loss": 0.7380337715148926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5378329157829285, + "epoch": 8.17, + "learning_rate": 1.0190664036817883e-05, + "loss": 0.7351, + "step": 9660, + "task_loss": 1.6195659637451172 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5582855343818665, + "epoch": 8.17, + "learning_rate": 1.0185967878275571e-05, + "loss": 0.6436, + "step": 9661, + "task_loss": 1.624598503112793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9149881601333618, + "epoch": 8.17, + "learning_rate": 1.0181271719733258e-05, + "loss": 0.5696, + "step": 9662, + "task_loss": 1.2571210861206055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4464848041534424, + "epoch": 8.17, + "learning_rate": 1.0176575561190946e-05, + "loss": 0.5061, + "step": 9663, + "task_loss": 0.18248122930526733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.571691632270813, + "epoch": 8.17, + "learning_rate": 1.0171879402648634e-05, + "loss": 0.5615, + "step": 9664, + "task_loss": 1.149930477142334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4990137219429016, + "epoch": 8.17, + "learning_rate": 1.016718324410632e-05, + "loss": 0.6064, + "step": 9665, + "task_loss": 1.0181905031204224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.477236807346344, + "epoch": 8.17, + "learning_rate": 1.016248708556401e-05, + "loss": 0.5367, + "step": 9666, + "task_loss": 1.084006905555725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6068460941314697, + "epoch": 8.17, + "learning_rate": 1.0157790927021698e-05, + "loss": 0.5269, + "step": 9667, + "task_loss": 0.8979296088218689 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.450136661529541, + "epoch": 8.17, + "learning_rate": 1.0153094768479384e-05, + "loss": 0.5861, + "step": 9668, + "task_loss": 0.442084401845932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5763944983482361, + "epoch": 8.17, + "learning_rate": 1.0148398609937072e-05, + "loss": 0.7372, + "step": 9669, + "task_loss": 0.5940802693367004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.23386327922344208, + "epoch": 8.17, + "learning_rate": 1.014370245139476e-05, + "loss": 0.4681, + "step": 9670, + "task_loss": 0.6548717617988586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43092072010040283, + "epoch": 8.17, + "learning_rate": 1.0139006292852447e-05, + "loss": 0.7079, + "step": 9671, + "task_loss": 0.9332940578460693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7766908407211304, + "epoch": 8.18, + "learning_rate": 1.0134310134310135e-05, + "loss": 0.6419, + "step": 9672, + "task_loss": 1.0150963068008423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46505236625671387, + "epoch": 8.18, + "learning_rate": 1.0129613975767823e-05, + "loss": 0.4794, + "step": 9673, + "task_loss": 0.6934946179389954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9029327630996704, + "epoch": 8.18, + "learning_rate": 1.012491781722551e-05, + "loss": 0.5521, + "step": 9674, + "task_loss": 0.5102378726005554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.366142600774765, + "epoch": 8.18, + "learning_rate": 1.0120221658683197e-05, + "loss": 0.3938, + "step": 9675, + "task_loss": 0.9380089044570923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5105211734771729, + "epoch": 8.18, + "learning_rate": 1.0115525500140885e-05, + "loss": 0.8085, + "step": 9676, + "task_loss": 0.9707764387130737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5969465970993042, + "epoch": 8.18, + "learning_rate": 1.0110829341598573e-05, + "loss": 0.4342, + "step": 9677, + "task_loss": 0.48047658801078796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4193398356437683, + "epoch": 8.18, + "learning_rate": 1.010613318305626e-05, + "loss": 0.5674, + "step": 9678, + "task_loss": 0.8955860137939453 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4202116131782532, + "epoch": 8.18, + "learning_rate": 1.0101437024513947e-05, + "loss": 0.5814, + "step": 9679, + "task_loss": 0.3602074682712555 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6812478303909302, + "epoch": 8.18, + "learning_rate": 1.0096740865971635e-05, + "loss": 0.4915, + "step": 9680, + "task_loss": 0.14519837498664856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2566354274749756, + "epoch": 8.18, + "learning_rate": 1.0092044707429323e-05, + "loss": 0.8063, + "step": 9681, + "task_loss": 0.8407101035118103 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36252814531326294, + "epoch": 8.18, + "learning_rate": 1.0087348548887012e-05, + "loss": 0.6194, + "step": 9682, + "task_loss": 0.3001737594604492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47411048412323, + "epoch": 8.19, + "learning_rate": 1.00826523903447e-05, + "loss": 0.4649, + "step": 9683, + "task_loss": 0.5101187229156494 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47575801610946655, + "epoch": 8.19, + "learning_rate": 1.0077956231802386e-05, + "loss": 0.539, + "step": 9684, + "task_loss": 0.4000416398048401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49338841438293457, + "epoch": 8.19, + "learning_rate": 1.0073260073260074e-05, + "loss": 0.4154, + "step": 9685, + "task_loss": 1.0669198036193848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5545761585235596, + "epoch": 8.19, + "learning_rate": 1.0068563914717762e-05, + "loss": 0.4591, + "step": 9686, + "task_loss": 0.8333578109741211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7649699449539185, + "epoch": 8.19, + "learning_rate": 1.0063867756175448e-05, + "loss": 0.5717, + "step": 9687, + "task_loss": 0.8900883793830872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.291098415851593, + "epoch": 8.19, + "learning_rate": 1.0059171597633136e-05, + "loss": 0.508, + "step": 9688, + "task_loss": 0.8146505355834961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5616877675056458, + "epoch": 8.19, + "learning_rate": 1.0054475439090824e-05, + "loss": 0.6619, + "step": 9689, + "task_loss": 0.32093581557273865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3483119010925293, + "epoch": 8.19, + "learning_rate": 1.0049779280548512e-05, + "loss": 0.5512, + "step": 9690, + "task_loss": 0.4091796278953552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7300522327423096, + "epoch": 8.19, + "learning_rate": 1.0045083122006199e-05, + "loss": 0.654, + "step": 9691, + "task_loss": 0.897581934928894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8546731472015381, + "epoch": 8.19, + "learning_rate": 1.0040386963463887e-05, + "loss": 0.7723, + "step": 9692, + "task_loss": 0.9717864394187927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42005980014801025, + "epoch": 8.19, + "learning_rate": 1.0035690804921575e-05, + "loss": 0.5727, + "step": 9693, + "task_loss": 0.2694462835788727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5674551725387573, + "epoch": 8.19, + "learning_rate": 1.0030994646379261e-05, + "loss": 0.5771, + "step": 9694, + "task_loss": 0.4497809410095215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6945803165435791, + "epoch": 8.2, + "learning_rate": 1.002629848783695e-05, + "loss": 0.5494, + "step": 9695, + "task_loss": 2.1089043617248535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7188489437103271, + "epoch": 8.2, + "learning_rate": 1.0021602329294639e-05, + "loss": 0.6364, + "step": 9696, + "task_loss": 0.7523070573806763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2655196189880371, + "epoch": 8.2, + "learning_rate": 1.0016906170752325e-05, + "loss": 0.6131, + "step": 9697, + "task_loss": 0.3392711579799652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3544926643371582, + "epoch": 8.2, + "learning_rate": 1.0012210012210013e-05, + "loss": 0.4978, + "step": 9698, + "task_loss": 0.45918214321136475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40462374687194824, + "epoch": 8.2, + "learning_rate": 1.0007513853667701e-05, + "loss": 0.4463, + "step": 9699, + "task_loss": 0.8340792059898376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7607669830322266, + "epoch": 8.2, + "learning_rate": 1.0002817695125388e-05, + "loss": 0.612, + "step": 9700, + "task_loss": 0.6583400964736938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7745248079299927, + "epoch": 8.2, + "learning_rate": 9.998121536583076e-06, + "loss": 0.5991, + "step": 9701, + "task_loss": 0.5977430939674377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.557267427444458, + "epoch": 8.2, + "learning_rate": 9.993425378040764e-06, + "loss": 0.6087, + "step": 9702, + "task_loss": 0.6721687316894531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6215983629226685, + "epoch": 8.2, + "learning_rate": 9.98872921949845e-06, + "loss": 0.622, + "step": 9703, + "task_loss": 0.8346000909805298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6345351934432983, + "epoch": 8.2, + "learning_rate": 9.984033060956138e-06, + "loss": 0.5236, + "step": 9704, + "task_loss": 1.017532467842102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6238666772842407, + "epoch": 8.2, + "learning_rate": 9.979336902413826e-06, + "loss": 0.6775, + "step": 9705, + "task_loss": 1.0134828090667725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4055737853050232, + "epoch": 8.2, + "learning_rate": 9.974640743871514e-06, + "loss": 0.6405, + "step": 9706, + "task_loss": 1.0618314743041992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3234702944755554, + "epoch": 8.21, + "learning_rate": 9.9699445853292e-06, + "loss": 0.489, + "step": 9707, + "task_loss": 0.4082378149032593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35542815923690796, + "epoch": 8.21, + "learning_rate": 9.965248426786889e-06, + "loss": 0.5287, + "step": 9708, + "task_loss": 0.22363245487213135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4324008822441101, + "epoch": 8.21, + "learning_rate": 9.960552268244577e-06, + "loss": 0.6214, + "step": 9709, + "task_loss": 0.7141844034194946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.24309559166431427, + "epoch": 8.21, + "learning_rate": 9.955856109702263e-06, + "loss": 0.5894, + "step": 9710, + "task_loss": 0.05265339836478233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7792916297912598, + "epoch": 8.21, + "learning_rate": 9.951159951159951e-06, + "loss": 0.5879, + "step": 9711, + "task_loss": 0.6979584693908691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4068969488143921, + "epoch": 8.21, + "learning_rate": 9.94646379261764e-06, + "loss": 0.4862, + "step": 9712, + "task_loss": 0.3067343533039093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5120339393615723, + "epoch": 8.21, + "learning_rate": 9.941767634075327e-06, + "loss": 0.8402, + "step": 9713, + "task_loss": 0.6233176589012146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43113771080970764, + "epoch": 8.21, + "learning_rate": 9.937071475533015e-06, + "loss": 0.6232, + "step": 9714, + "task_loss": 0.5898086428642273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.665696918964386, + "epoch": 8.21, + "learning_rate": 9.932375316990703e-06, + "loss": 0.595, + "step": 9715, + "task_loss": 0.6677228808403015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2720430791378021, + "epoch": 8.21, + "learning_rate": 9.92767915844839e-06, + "loss": 0.4782, + "step": 9716, + "task_loss": 0.2785189747810364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.609301745891571, + "epoch": 8.21, + "learning_rate": 9.922982999906077e-06, + "loss": 0.6168, + "step": 9717, + "task_loss": 0.6175473928451538 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5398749113082886, + "epoch": 8.21, + "learning_rate": 9.918286841363765e-06, + "loss": 0.6454, + "step": 9718, + "task_loss": 0.8945716619491577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.995951771736145, + "epoch": 8.22, + "learning_rate": 9.913590682821452e-06, + "loss": 0.6411, + "step": 9719, + "task_loss": 1.336043357849121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7726734280586243, + "epoch": 8.22, + "learning_rate": 9.90889452427914e-06, + "loss": 0.5187, + "step": 9720, + "task_loss": 0.6701869368553162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.562652587890625, + "epoch": 8.22, + "learning_rate": 9.904198365736828e-06, + "loss": 0.6282, + "step": 9721, + "task_loss": 1.326302409172058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5632829666137695, + "epoch": 8.22, + "learning_rate": 9.899502207194516e-06, + "loss": 0.5057, + "step": 9722, + "task_loss": 0.7973195314407349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9584172368049622, + "epoch": 8.22, + "learning_rate": 9.894806048652202e-06, + "loss": 0.6318, + "step": 9723, + "task_loss": 1.7145293951034546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6352119445800781, + "epoch": 8.22, + "learning_rate": 9.89010989010989e-06, + "loss": 0.5344, + "step": 9724, + "task_loss": 0.5287065505981445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.499528706073761, + "epoch": 8.22, + "learning_rate": 9.885413731567578e-06, + "loss": 0.5415, + "step": 9725, + "task_loss": 0.39163243770599365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4306955337524414, + "epoch": 8.22, + "learning_rate": 9.880717573025265e-06, + "loss": 0.3998, + "step": 9726, + "task_loss": 0.2651369869709015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4944876432418823, + "epoch": 8.22, + "learning_rate": 9.876021414482954e-06, + "loss": 0.4729, + "step": 9727, + "task_loss": 1.165845274925232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6704347729682922, + "epoch": 8.22, + "learning_rate": 9.871325255940642e-06, + "loss": 0.6356, + "step": 9728, + "task_loss": 1.2839113473892212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8214427828788757, + "epoch": 8.22, + "learning_rate": 9.866629097398329e-06, + "loss": 0.7324, + "step": 9729, + "task_loss": 0.5895955562591553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.20682567358016968, + "epoch": 8.22, + "learning_rate": 9.861932938856017e-06, + "loss": 0.3554, + "step": 9730, + "task_loss": 0.5936257839202881 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4610383212566376, + "epoch": 8.23, + "learning_rate": 9.857236780313705e-06, + "loss": 0.5255, + "step": 9731, + "task_loss": 0.6102650165557861 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32398366928100586, + "epoch": 8.23, + "learning_rate": 9.852540621771391e-06, + "loss": 0.4518, + "step": 9732, + "task_loss": 0.7745774984359741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.445776104927063, + "epoch": 8.23, + "learning_rate": 9.84784446322908e-06, + "loss": 0.6412, + "step": 9733, + "task_loss": 0.5203332901000977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.573090136051178, + "epoch": 8.23, + "learning_rate": 9.843148304686767e-06, + "loss": 0.5677, + "step": 9734, + "task_loss": 0.6419084668159485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4735628068447113, + "epoch": 8.23, + "learning_rate": 9.838452146144454e-06, + "loss": 0.5953, + "step": 9735, + "task_loss": 0.3241812586784363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5336918234825134, + "epoch": 8.23, + "learning_rate": 9.833755987602142e-06, + "loss": 0.7617, + "step": 9736, + "task_loss": 1.0960500240325928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40026795864105225, + "epoch": 8.23, + "learning_rate": 9.82905982905983e-06, + "loss": 0.6389, + "step": 9737, + "task_loss": 0.36770427227020264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5087677836418152, + "epoch": 8.23, + "learning_rate": 9.824363670517516e-06, + "loss": 0.4718, + "step": 9738, + "task_loss": 1.3786020278930664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42891961336135864, + "epoch": 8.23, + "learning_rate": 9.819667511975204e-06, + "loss": 0.4519, + "step": 9739, + "task_loss": 1.9564852714538574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7476685047149658, + "epoch": 8.23, + "learning_rate": 9.814971353432892e-06, + "loss": 0.6615, + "step": 9740, + "task_loss": 1.2797794342041016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49826616048812866, + "epoch": 8.23, + "learning_rate": 9.81027519489058e-06, + "loss": 0.6383, + "step": 9741, + "task_loss": 0.5478835105895996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45795324444770813, + "epoch": 8.23, + "learning_rate": 9.805579036348266e-06, + "loss": 0.4398, + "step": 9742, + "task_loss": 0.909438967704773 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6108505725860596, + "epoch": 8.24, + "learning_rate": 9.800882877805956e-06, + "loss": 0.5077, + "step": 9743, + "task_loss": 1.0625921487808228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.328755646944046, + "epoch": 8.24, + "learning_rate": 9.796186719263643e-06, + "loss": 0.4961, + "step": 9744, + "task_loss": 0.2692311406135559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44167360663414, + "epoch": 8.24, + "learning_rate": 9.79149056072133e-06, + "loss": 0.4351, + "step": 9745, + "task_loss": 0.9758546352386475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5998272895812988, + "epoch": 8.24, + "learning_rate": 9.786794402179019e-06, + "loss": 0.6467, + "step": 9746, + "task_loss": 0.7230185866355896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6485182642936707, + "epoch": 8.24, + "learning_rate": 9.782098243636707e-06, + "loss": 0.6114, + "step": 9747, + "task_loss": 0.7181310057640076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5027868747711182, + "epoch": 8.24, + "learning_rate": 9.777402085094393e-06, + "loss": 0.561, + "step": 9748, + "task_loss": 1.5743048191070557 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.272612065076828, + "epoch": 8.24, + "learning_rate": 9.772705926552081e-06, + "loss": 0.4502, + "step": 9749, + "task_loss": 0.5890927910804749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5301039218902588, + "epoch": 8.24, + "learning_rate": 9.768009768009769e-06, + "loss": 0.4548, + "step": 9750, + "task_loss": 0.6430805325508118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7540709972381592, + "epoch": 8.24, + "learning_rate": 9.763313609467455e-06, + "loss": 0.7002, + "step": 9751, + "task_loss": 1.4772731065750122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3252166509628296, + "epoch": 8.24, + "learning_rate": 9.758617450925143e-06, + "loss": 0.493, + "step": 9752, + "task_loss": 0.5621254444122314 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5571260452270508, + "epoch": 8.24, + "learning_rate": 9.753921292382831e-06, + "loss": 0.5343, + "step": 9753, + "task_loss": 0.5988714694976807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38744351267814636, + "epoch": 8.24, + "learning_rate": 9.749225133840518e-06, + "loss": 0.5826, + "step": 9754, + "task_loss": 0.2282389998435974 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4892951250076294, + "epoch": 8.25, + "learning_rate": 9.744528975298206e-06, + "loss": 0.5114, + "step": 9755, + "task_loss": 1.217790961265564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.523906946182251, + "epoch": 8.25, + "learning_rate": 9.739832816755894e-06, + "loss": 0.529, + "step": 9756, + "task_loss": 0.8651268482208252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5354322195053101, + "epoch": 8.25, + "learning_rate": 9.735136658213582e-06, + "loss": 0.5072, + "step": 9757, + "task_loss": 1.184323787689209 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42520955204963684, + "epoch": 8.25, + "learning_rate": 9.73044049967127e-06, + "loss": 0.4346, + "step": 9758, + "task_loss": 0.5469807982444763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3210119605064392, + "epoch": 8.25, + "learning_rate": 9.725744341128958e-06, + "loss": 0.4141, + "step": 9759, + "task_loss": 0.3511040210723877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29372698068618774, + "epoch": 8.25, + "learning_rate": 9.721048182586644e-06, + "loss": 0.4049, + "step": 9760, + "task_loss": 0.18916447460651398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5697832703590393, + "epoch": 8.25, + "learning_rate": 9.716352024044332e-06, + "loss": 0.5522, + "step": 9761, + "task_loss": 1.0560085773468018 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7433333396911621, + "epoch": 8.25, + "learning_rate": 9.71165586550202e-06, + "loss": 0.6382, + "step": 9762, + "task_loss": 0.6183957457542419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4126461148262024, + "epoch": 8.25, + "learning_rate": 9.706959706959708e-06, + "loss": 0.5105, + "step": 9763, + "task_loss": 0.6432529091835022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4319184422492981, + "epoch": 8.25, + "learning_rate": 9.702263548417395e-06, + "loss": 0.7527, + "step": 9764, + "task_loss": 0.5215303301811218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39973318576812744, + "epoch": 8.25, + "learning_rate": 9.697567389875083e-06, + "loss": 0.4376, + "step": 9765, + "task_loss": 0.6656894087791443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5883221626281738, + "epoch": 8.26, + "learning_rate": 9.69287123133277e-06, + "loss": 0.7442, + "step": 9766, + "task_loss": 0.8088876008987427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40741658210754395, + "epoch": 8.26, + "learning_rate": 9.688175072790457e-06, + "loss": 0.504, + "step": 9767, + "task_loss": 0.08677873015403748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.712173342704773, + "epoch": 8.26, + "learning_rate": 9.683478914248145e-06, + "loss": 0.6045, + "step": 9768, + "task_loss": 0.44167467951774597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6513352990150452, + "epoch": 8.26, + "learning_rate": 9.678782755705833e-06, + "loss": 0.7382, + "step": 9769, + "task_loss": 0.48553502559661865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7241733074188232, + "epoch": 8.26, + "learning_rate": 9.67408659716352e-06, + "loss": 0.7564, + "step": 9770, + "task_loss": 0.7730306386947632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6585714817047119, + "epoch": 8.26, + "learning_rate": 9.669390438621208e-06, + "loss": 0.6848, + "step": 9771, + "task_loss": 0.9087777733802795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6228389143943787, + "epoch": 8.26, + "learning_rate": 9.664694280078896e-06, + "loss": 0.6036, + "step": 9772, + "task_loss": 0.8109902739524841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33220309019088745, + "epoch": 8.26, + "learning_rate": 9.659998121536584e-06, + "loss": 0.4374, + "step": 9773, + "task_loss": 0.10776631534099579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6427100896835327, + "epoch": 8.26, + "learning_rate": 9.655301962994272e-06, + "loss": 0.6378, + "step": 9774, + "task_loss": 0.683043360710144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3917993903160095, + "epoch": 8.26, + "learning_rate": 9.65060580445196e-06, + "loss": 0.54, + "step": 9775, + "task_loss": 0.7201816439628601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5498101711273193, + "epoch": 8.26, + "learning_rate": 9.645909645909646e-06, + "loss": 0.4773, + "step": 9776, + "task_loss": 0.8182201981544495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25872287154197693, + "epoch": 8.26, + "learning_rate": 9.641213487367334e-06, + "loss": 0.5072, + "step": 9777, + "task_loss": 0.514519214630127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.51511549949646, + "epoch": 8.27, + "learning_rate": 9.636517328825022e-06, + "loss": 0.5276, + "step": 9778, + "task_loss": 0.7080508470535278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4420640170574188, + "epoch": 8.27, + "learning_rate": 9.63182117028271e-06, + "loss": 0.6357, + "step": 9779, + "task_loss": 0.8679093718528748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27520108222961426, + "epoch": 8.27, + "learning_rate": 9.627125011740396e-06, + "loss": 0.5053, + "step": 9780, + "task_loss": 0.05081824213266373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5846917629241943, + "epoch": 8.27, + "learning_rate": 9.622428853198085e-06, + "loss": 0.6561, + "step": 9781, + "task_loss": 0.25746241211891174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36217206716537476, + "epoch": 8.27, + "learning_rate": 9.617732694655773e-06, + "loss": 0.5371, + "step": 9782, + "task_loss": 0.45610925555229187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4579867124557495, + "epoch": 8.27, + "learning_rate": 9.613036536113459e-06, + "loss": 0.5528, + "step": 9783, + "task_loss": 1.120659589767456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.758806586265564, + "epoch": 8.27, + "learning_rate": 9.608340377571147e-06, + "loss": 0.5567, + "step": 9784, + "task_loss": 1.1297321319580078 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33021819591522217, + "epoch": 8.27, + "learning_rate": 9.603644219028835e-06, + "loss": 0.639, + "step": 9785, + "task_loss": 0.7937184572219849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5953812003135681, + "epoch": 8.27, + "learning_rate": 9.598948060486521e-06, + "loss": 0.7165, + "step": 9786, + "task_loss": 0.6437649726867676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6374558210372925, + "epoch": 8.27, + "learning_rate": 9.59425190194421e-06, + "loss": 0.6016, + "step": 9787, + "task_loss": 0.7424291968345642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8017048835754395, + "epoch": 8.27, + "learning_rate": 9.589555743401897e-06, + "loss": 0.6367, + "step": 9788, + "task_loss": 0.5568118691444397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8357959389686584, + "epoch": 8.27, + "learning_rate": 9.584859584859585e-06, + "loss": 0.6543, + "step": 9789, + "task_loss": 0.9232833385467529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6738321781158447, + "epoch": 8.28, + "learning_rate": 9.580163426317273e-06, + "loss": 0.5385, + "step": 9790, + "task_loss": 0.7109473943710327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4712334871292114, + "epoch": 8.28, + "learning_rate": 9.575467267774961e-06, + "loss": 0.7312, + "step": 9791, + "task_loss": 1.2037276029586792 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6693501472473145, + "epoch": 8.28, + "learning_rate": 9.570771109232648e-06, + "loss": 0.5202, + "step": 9792, + "task_loss": 2.1047422885894775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9694004654884338, + "epoch": 8.28, + "learning_rate": 9.566074950690336e-06, + "loss": 0.6129, + "step": 9793, + "task_loss": 0.8538075685501099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43758994340896606, + "epoch": 8.28, + "learning_rate": 9.561378792148024e-06, + "loss": 0.5287, + "step": 9794, + "task_loss": 0.6865754127502441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6472204923629761, + "epoch": 8.28, + "learning_rate": 9.556682633605712e-06, + "loss": 0.5687, + "step": 9795, + "task_loss": 1.2798367738723755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8585596084594727, + "epoch": 8.28, + "learning_rate": 9.551986475063398e-06, + "loss": 0.8127, + "step": 9796, + "task_loss": 1.3665858507156372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5408956408500671, + "epoch": 8.28, + "learning_rate": 9.547290316521086e-06, + "loss": 0.5115, + "step": 9797, + "task_loss": 0.6298164129257202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.424552857875824, + "epoch": 8.28, + "learning_rate": 9.542594157978774e-06, + "loss": 0.3862, + "step": 9798, + "task_loss": 0.039411406964063644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5289142727851868, + "epoch": 8.28, + "learning_rate": 9.53789799943646e-06, + "loss": 0.5061, + "step": 9799, + "task_loss": 0.5404196381568909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.129946231842041, + "epoch": 8.28, + "learning_rate": 9.533201840894149e-06, + "loss": 0.6086, + "step": 9800, + "task_loss": 1.0877065658569336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3574545681476593, + "epoch": 8.28, + "learning_rate": 9.528505682351837e-06, + "loss": 0.5131, + "step": 9801, + "task_loss": 0.9301601648330688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5210535526275635, + "epoch": 8.29, + "learning_rate": 9.523809523809523e-06, + "loss": 0.512, + "step": 9802, + "task_loss": 0.9841170310974121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44675412774086, + "epoch": 8.29, + "learning_rate": 9.519113365267211e-06, + "loss": 0.5596, + "step": 9803, + "task_loss": 0.12356835603713989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4505787789821625, + "epoch": 8.29, + "learning_rate": 9.5144172067249e-06, + "loss": 0.513, + "step": 9804, + "task_loss": 0.4944281280040741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.526336133480072, + "epoch": 8.29, + "learning_rate": 9.509721048182587e-06, + "loss": 0.5685, + "step": 9805, + "task_loss": 0.44564059376716614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5431277751922607, + "epoch": 8.29, + "learning_rate": 9.505024889640275e-06, + "loss": 0.482, + "step": 9806, + "task_loss": 1.199747920036316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34640926122665405, + "epoch": 8.29, + "learning_rate": 9.500328731097963e-06, + "loss": 0.6061, + "step": 9807, + "task_loss": 0.7217783331871033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5892632007598877, + "epoch": 8.29, + "learning_rate": 9.49563257255565e-06, + "loss": 0.4795, + "step": 9808, + "task_loss": 0.410609632730484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5585726499557495, + "epoch": 8.29, + "learning_rate": 9.490936414013338e-06, + "loss": 0.7405, + "step": 9809, + "task_loss": 1.2846821546554565 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7344493865966797, + "epoch": 8.29, + "learning_rate": 9.486240255471026e-06, + "loss": 0.465, + "step": 9810, + "task_loss": 0.8095296025276184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4598012864589691, + "epoch": 8.29, + "learning_rate": 9.481544096928712e-06, + "loss": 0.4468, + "step": 9811, + "task_loss": 0.5539670586585999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6377863883972168, + "epoch": 8.29, + "learning_rate": 9.4768479383864e-06, + "loss": 0.5779, + "step": 9812, + "task_loss": 0.6633439064025879 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29402726888656616, + "epoch": 8.29, + "learning_rate": 9.472151779844088e-06, + "loss": 0.4129, + "step": 9813, + "task_loss": 0.21798260509967804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4857614040374756, + "epoch": 8.3, + "learning_rate": 9.467455621301776e-06, + "loss": 0.6912, + "step": 9814, + "task_loss": 0.5932385325431824 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5968766212463379, + "epoch": 8.3, + "learning_rate": 9.462759462759462e-06, + "loss": 0.5771, + "step": 9815, + "task_loss": 0.247098907828331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7497870326042175, + "epoch": 8.3, + "learning_rate": 9.45806330421715e-06, + "loss": 0.5713, + "step": 9816, + "task_loss": 1.2362128496170044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.552804172039032, + "epoch": 8.3, + "learning_rate": 9.453367145674838e-06, + "loss": 0.609, + "step": 9817, + "task_loss": 0.1970565766096115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.22541718184947968, + "epoch": 8.3, + "learning_rate": 9.448670987132525e-06, + "loss": 0.4929, + "step": 9818, + "task_loss": 1.2011363506317139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5659181475639343, + "epoch": 8.3, + "learning_rate": 9.443974828590213e-06, + "loss": 0.6198, + "step": 9819, + "task_loss": 0.4159109890460968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.23107323050498962, + "epoch": 8.3, + "learning_rate": 9.439278670047903e-06, + "loss": 0.5945, + "step": 9820, + "task_loss": 0.8887677192687988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7545822858810425, + "epoch": 8.3, + "learning_rate": 9.434582511505589e-06, + "loss": 0.6004, + "step": 9821, + "task_loss": 1.0762418508529663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6824984550476074, + "epoch": 8.3, + "learning_rate": 9.429886352963277e-06, + "loss": 0.7768, + "step": 9822, + "task_loss": 1.316476821899414 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7736609578132629, + "epoch": 8.3, + "learning_rate": 9.425190194420965e-06, + "loss": 0.6033, + "step": 9823, + "task_loss": 0.5471997857093811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7312540411949158, + "epoch": 8.3, + "learning_rate": 9.420494035878651e-06, + "loss": 0.549, + "step": 9824, + "task_loss": 0.6482584476470947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42153728008270264, + "epoch": 8.3, + "learning_rate": 9.41579787733634e-06, + "loss": 0.5171, + "step": 9825, + "task_loss": 0.5863528847694397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6940416693687439, + "epoch": 8.31, + "learning_rate": 9.411101718794027e-06, + "loss": 0.573, + "step": 9826, + "task_loss": 0.5613134503364563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5054451823234558, + "epoch": 8.31, + "learning_rate": 9.406405560251714e-06, + "loss": 0.4822, + "step": 9827, + "task_loss": 0.2842039465904236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6866984367370605, + "epoch": 8.31, + "learning_rate": 9.401709401709402e-06, + "loss": 0.7226, + "step": 9828, + "task_loss": 1.023725152015686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3244076371192932, + "epoch": 8.31, + "learning_rate": 9.39701324316709e-06, + "loss": 0.5121, + "step": 9829, + "task_loss": 0.550408124923706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5420448184013367, + "epoch": 8.31, + "learning_rate": 9.392317084624778e-06, + "loss": 0.4586, + "step": 9830, + "task_loss": 0.6780598163604736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32724741101264954, + "epoch": 8.31, + "learning_rate": 9.387620926082464e-06, + "loss": 0.6314, + "step": 9831, + "task_loss": 0.6496874690055847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3571416735649109, + "epoch": 8.31, + "learning_rate": 9.382924767540152e-06, + "loss": 0.3585, + "step": 9832, + "task_loss": 0.686508297920227 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.701358437538147, + "epoch": 8.31, + "learning_rate": 9.37822860899784e-06, + "loss": 0.5044, + "step": 9833, + "task_loss": 0.7468867897987366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5182033777236938, + "epoch": 8.31, + "learning_rate": 9.373532450455527e-06, + "loss": 0.6396, + "step": 9834, + "task_loss": 0.3869730830192566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3870936334133148, + "epoch": 8.31, + "learning_rate": 9.368836291913216e-06, + "loss": 0.5163, + "step": 9835, + "task_loss": 0.39969202876091003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6727232933044434, + "epoch": 8.31, + "learning_rate": 9.364140133370904e-06, + "loss": 0.6262, + "step": 9836, + "task_loss": 0.6602930426597595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7136795520782471, + "epoch": 8.32, + "learning_rate": 9.35944397482859e-06, + "loss": 0.6357, + "step": 9837, + "task_loss": 0.7449325919151306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5230521559715271, + "epoch": 8.32, + "learning_rate": 9.354747816286279e-06, + "loss": 0.6123, + "step": 9838, + "task_loss": 0.8682950735092163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5134536027908325, + "epoch": 8.32, + "learning_rate": 9.350051657743967e-06, + "loss": 0.5807, + "step": 9839, + "task_loss": 0.5762718915939331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7198781371116638, + "epoch": 8.32, + "learning_rate": 9.345355499201653e-06, + "loss": 0.6157, + "step": 9840, + "task_loss": 1.0532041788101196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7158334851264954, + "epoch": 8.32, + "learning_rate": 9.340659340659341e-06, + "loss": 0.4403, + "step": 9841, + "task_loss": 1.0027846097946167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46309924125671387, + "epoch": 8.32, + "learning_rate": 9.33596318211703e-06, + "loss": 0.4605, + "step": 9842, + "task_loss": 0.4700089693069458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.57912278175354, + "epoch": 8.32, + "learning_rate": 9.331267023574715e-06, + "loss": 0.6323, + "step": 9843, + "task_loss": 0.8427464962005615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3779534101486206, + "epoch": 8.32, + "learning_rate": 9.326570865032404e-06, + "loss": 0.4724, + "step": 9844, + "task_loss": 0.33405959606170654 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35224449634552, + "epoch": 8.32, + "learning_rate": 9.321874706490092e-06, + "loss": 0.7443, + "step": 9845, + "task_loss": 0.4259592294692993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47084662318229675, + "epoch": 8.32, + "learning_rate": 9.31717854794778e-06, + "loss": 0.5708, + "step": 9846, + "task_loss": 0.9425650238990784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7551038861274719, + "epoch": 8.32, + "learning_rate": 9.312482389405466e-06, + "loss": 0.6328, + "step": 9847, + "task_loss": 0.5099654793739319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4672972559928894, + "epoch": 8.32, + "learning_rate": 9.307786230863154e-06, + "loss": 0.4261, + "step": 9848, + "task_loss": 0.5409045815467834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2891404628753662, + "epoch": 8.33, + "learning_rate": 9.303090072320842e-06, + "loss": 0.7567, + "step": 9849, + "task_loss": 1.220738172531128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9625993371009827, + "epoch": 8.33, + "learning_rate": 9.298393913778528e-06, + "loss": 0.5627, + "step": 9850, + "task_loss": 1.092787265777588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5076558589935303, + "epoch": 8.33, + "learning_rate": 9.293697755236218e-06, + "loss": 0.5747, + "step": 9851, + "task_loss": 0.4467574954032898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4465334117412567, + "epoch": 8.33, + "learning_rate": 9.289001596693906e-06, + "loss": 0.4568, + "step": 9852, + "task_loss": 0.4624335467815399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9595004320144653, + "epoch": 8.33, + "learning_rate": 9.284305438151592e-06, + "loss": 0.7431, + "step": 9853, + "task_loss": 0.8254464864730835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8989959359169006, + "epoch": 8.33, + "learning_rate": 9.27960927960928e-06, + "loss": 0.6264, + "step": 9854, + "task_loss": 1.0959153175354004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6846670508384705, + "epoch": 8.33, + "learning_rate": 9.274913121066969e-06, + "loss": 0.545, + "step": 9855, + "task_loss": 0.7316596508026123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5249243974685669, + "epoch": 8.33, + "learning_rate": 9.270216962524655e-06, + "loss": 0.485, + "step": 9856, + "task_loss": 0.15920794010162354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9239523410797119, + "epoch": 8.33, + "learning_rate": 9.265520803982343e-06, + "loss": 0.5713, + "step": 9857, + "task_loss": 0.8075742721557617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.410297155380249, + "epoch": 8.33, + "learning_rate": 9.260824645440031e-06, + "loss": 0.5242, + "step": 9858, + "task_loss": 0.8583146929740906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7461534738540649, + "epoch": 8.33, + "learning_rate": 9.256128486897717e-06, + "loss": 0.5406, + "step": 9859, + "task_loss": 0.5877596139907837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6504094004631042, + "epoch": 8.33, + "learning_rate": 9.251432328355405e-06, + "loss": 0.6921, + "step": 9860, + "task_loss": 1.1524240970611572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4042786657810211, + "epoch": 8.34, + "learning_rate": 9.246736169813093e-06, + "loss": 0.5406, + "step": 9861, + "task_loss": 0.09624804556369781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3460994362831116, + "epoch": 8.34, + "learning_rate": 9.242040011270781e-06, + "loss": 0.6901, + "step": 9862, + "task_loss": 0.6882423758506775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6399083137512207, + "epoch": 8.34, + "learning_rate": 9.237343852728468e-06, + "loss": 0.5531, + "step": 9863, + "task_loss": 0.6773644685745239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5624182820320129, + "epoch": 8.34, + "learning_rate": 9.232647694186156e-06, + "loss": 0.5945, + "step": 9864, + "task_loss": 0.47147059440612793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2829819917678833, + "epoch": 8.34, + "learning_rate": 9.227951535643844e-06, + "loss": 0.4822, + "step": 9865, + "task_loss": 0.18068411946296692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7106842398643494, + "epoch": 8.34, + "learning_rate": 9.223255377101532e-06, + "loss": 0.6952, + "step": 9866, + "task_loss": 0.5687665343284607 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.444815993309021, + "epoch": 8.34, + "learning_rate": 9.21855921855922e-06, + "loss": 0.5274, + "step": 9867, + "task_loss": 0.10060250014066696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6980248689651489, + "epoch": 8.34, + "learning_rate": 9.213863060016908e-06, + "loss": 0.5721, + "step": 9868, + "task_loss": 0.705528199672699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43110477924346924, + "epoch": 8.34, + "learning_rate": 9.209166901474594e-06, + "loss": 0.4188, + "step": 9869, + "task_loss": 0.20876123011112213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3453638553619385, + "epoch": 8.34, + "learning_rate": 9.204470742932282e-06, + "loss": 0.504, + "step": 9870, + "task_loss": 0.4599919021129608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32639408111572266, + "epoch": 8.34, + "learning_rate": 9.19977458438997e-06, + "loss": 0.3325, + "step": 9871, + "task_loss": 0.9029172658920288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6806925535202026, + "epoch": 8.34, + "learning_rate": 9.195078425847657e-06, + "loss": 0.6599, + "step": 9872, + "task_loss": 0.5950111150741577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5843950510025024, + "epoch": 8.35, + "learning_rate": 9.190382267305345e-06, + "loss": 0.6397, + "step": 9873, + "task_loss": 1.0508968830108643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41063106060028076, + "epoch": 8.35, + "learning_rate": 9.185686108763033e-06, + "loss": 0.7784, + "step": 9874, + "task_loss": 0.635343611240387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7256381511688232, + "epoch": 8.35, + "learning_rate": 9.180989950220719e-06, + "loss": 0.6194, + "step": 9875, + "task_loss": 2.0023581981658936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5069853067398071, + "epoch": 8.35, + "learning_rate": 9.176293791678407e-06, + "loss": 0.4845, + "step": 9876, + "task_loss": 0.10335472971200943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43046170473098755, + "epoch": 8.35, + "learning_rate": 9.171597633136095e-06, + "loss": 0.5375, + "step": 9877, + "task_loss": 0.4157020151615143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.533750057220459, + "epoch": 8.35, + "learning_rate": 9.166901474593783e-06, + "loss": 0.4624, + "step": 9878, + "task_loss": 0.6177086234092712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38084644079208374, + "epoch": 8.35, + "learning_rate": 9.16220531605147e-06, + "loss": 0.3739, + "step": 9879, + "task_loss": 0.1281488537788391 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.22021713852882385, + "epoch": 8.35, + "learning_rate": 9.157509157509158e-06, + "loss": 0.4644, + "step": 9880, + "task_loss": 0.27022141218185425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4078410565853119, + "epoch": 8.35, + "learning_rate": 9.152812998966846e-06, + "loss": 0.5868, + "step": 9881, + "task_loss": 0.31821632385253906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5515609979629517, + "epoch": 8.35, + "learning_rate": 9.148116840424534e-06, + "loss": 0.4623, + "step": 9882, + "task_loss": 0.4838623106479645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8344786167144775, + "epoch": 8.35, + "learning_rate": 9.143420681882222e-06, + "loss": 0.581, + "step": 9883, + "task_loss": 1.0338114500045776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.602864146232605, + "epoch": 8.35, + "learning_rate": 9.13872452333991e-06, + "loss": 0.6864, + "step": 9884, + "task_loss": 0.6836211085319519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44459831714630127, + "epoch": 8.36, + "learning_rate": 9.134028364797596e-06, + "loss": 0.5839, + "step": 9885, + "task_loss": 0.7362571954727173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.546707034111023, + "epoch": 8.36, + "learning_rate": 9.129332206255284e-06, + "loss": 0.4967, + "step": 9886, + "task_loss": 0.5246952772140503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7502166032791138, + "epoch": 8.36, + "learning_rate": 9.124636047712972e-06, + "loss": 0.5536, + "step": 9887, + "task_loss": 1.1476266384124756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6215810179710388, + "epoch": 8.36, + "learning_rate": 9.119939889170658e-06, + "loss": 0.7639, + "step": 9888, + "task_loss": 0.9441189765930176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7858877182006836, + "epoch": 8.36, + "learning_rate": 9.115243730628346e-06, + "loss": 0.7488, + "step": 9889, + "task_loss": 0.33262619376182556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4562780261039734, + "epoch": 8.36, + "learning_rate": 9.110547572086034e-06, + "loss": 0.6376, + "step": 9890, + "task_loss": 0.5660277605056763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4226152300834656, + "epoch": 8.36, + "learning_rate": 9.10585141354372e-06, + "loss": 0.4095, + "step": 9891, + "task_loss": 0.3288862407207489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8253062963485718, + "epoch": 8.36, + "learning_rate": 9.101155255001409e-06, + "loss": 0.6344, + "step": 9892, + "task_loss": 0.7457794547080994 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5540324449539185, + "epoch": 8.36, + "learning_rate": 9.096459096459097e-06, + "loss": 0.5542, + "step": 9893, + "task_loss": 0.6821209788322449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41184282302856445, + "epoch": 8.36, + "learning_rate": 9.091762937916783e-06, + "loss": 0.4721, + "step": 9894, + "task_loss": 0.49386656284332275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5689901113510132, + "epoch": 8.36, + "learning_rate": 9.087066779374471e-06, + "loss": 0.5003, + "step": 9895, + "task_loss": 0.5945345759391785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2851574718952179, + "epoch": 8.36, + "learning_rate": 9.08237062083216e-06, + "loss": 0.5832, + "step": 9896, + "task_loss": 0.5732107758522034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2507196068763733, + "epoch": 8.37, + "learning_rate": 9.077674462289847e-06, + "loss": 0.4895, + "step": 9897, + "task_loss": 0.691851794719696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4416786730289459, + "epoch": 8.37, + "learning_rate": 9.072978303747535e-06, + "loss": 0.7029, + "step": 9898, + "task_loss": 0.5714308023452759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9821386337280273, + "epoch": 8.37, + "learning_rate": 9.068282145205223e-06, + "loss": 0.6547, + "step": 9899, + "task_loss": 0.3928053677082062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6818063259124756, + "epoch": 8.37, + "learning_rate": 9.06358598666291e-06, + "loss": 0.7906, + "step": 9900, + "task_loss": 1.0516163110733032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8048753142356873, + "epoch": 8.37, + "learning_rate": 9.058889828120598e-06, + "loss": 0.6706, + "step": 9901, + "task_loss": 0.9235149025917053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7325043082237244, + "epoch": 8.37, + "learning_rate": 9.054193669578286e-06, + "loss": 0.7559, + "step": 9902, + "task_loss": 0.9463815689086914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39544016122817993, + "epoch": 8.37, + "learning_rate": 9.049497511035974e-06, + "loss": 0.469, + "step": 9903, + "task_loss": 0.15386584401130676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4801464080810547, + "epoch": 8.37, + "learning_rate": 9.04480135249366e-06, + "loss": 0.5932, + "step": 9904, + "task_loss": 0.28420203924179077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5612540245056152, + "epoch": 8.37, + "learning_rate": 9.040105193951348e-06, + "loss": 0.5011, + "step": 9905, + "task_loss": 0.660342812538147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47385135293006897, + "epoch": 8.37, + "learning_rate": 9.035409035409036e-06, + "loss": 0.5964, + "step": 9906, + "task_loss": 1.402239203453064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4128328263759613, + "epoch": 8.37, + "learning_rate": 9.030712876866723e-06, + "loss": 0.5044, + "step": 9907, + "task_loss": 0.3803658187389374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4382241368293762, + "epoch": 8.38, + "learning_rate": 9.02601671832441e-06, + "loss": 0.4267, + "step": 9908, + "task_loss": 0.8853191137313843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34184759855270386, + "epoch": 8.38, + "learning_rate": 9.021320559782099e-06, + "loss": 0.4768, + "step": 9909, + "task_loss": 0.7819085717201233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5643121004104614, + "epoch": 8.38, + "learning_rate": 9.016624401239785e-06, + "loss": 0.5003, + "step": 9910, + "task_loss": 0.42143404483795166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7201863527297974, + "epoch": 8.38, + "learning_rate": 9.011928242697473e-06, + "loss": 0.5546, + "step": 9911, + "task_loss": 1.3284950256347656 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5839823484420776, + "epoch": 8.38, + "learning_rate": 9.007232084155163e-06, + "loss": 0.5652, + "step": 9912, + "task_loss": 0.33166030049324036 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38592734932899475, + "epoch": 8.38, + "learning_rate": 9.002535925612849e-06, + "loss": 0.4959, + "step": 9913, + "task_loss": 1.1307443380355835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.640953779220581, + "epoch": 8.38, + "learning_rate": 8.997839767070537e-06, + "loss": 0.7092, + "step": 9914, + "task_loss": 0.7431163787841797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31718337535858154, + "epoch": 8.38, + "learning_rate": 8.993143608528225e-06, + "loss": 0.4287, + "step": 9915, + "task_loss": 0.12039118260145187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5788823962211609, + "epoch": 8.38, + "learning_rate": 8.988447449985911e-06, + "loss": 0.5043, + "step": 9916, + "task_loss": 0.3690899610519409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3280255198478699, + "epoch": 8.38, + "learning_rate": 8.9837512914436e-06, + "loss": 0.6785, + "step": 9917, + "task_loss": 0.3083937466144562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46402707695961, + "epoch": 8.38, + "learning_rate": 8.979055132901288e-06, + "loss": 0.675, + "step": 9918, + "task_loss": 0.46206536889076233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9173400402069092, + "epoch": 8.38, + "learning_rate": 8.974358974358976e-06, + "loss": 0.5719, + "step": 9919, + "task_loss": 1.5207709074020386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7511264681816101, + "epoch": 8.39, + "learning_rate": 8.969662815816662e-06, + "loss": 0.6149, + "step": 9920, + "task_loss": 1.3965877294540405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5534853935241699, + "epoch": 8.39, + "learning_rate": 8.96496665727435e-06, + "loss": 0.4452, + "step": 9921, + "task_loss": 0.5477822422981262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3337644636631012, + "epoch": 8.39, + "learning_rate": 8.960270498732038e-06, + "loss": 0.6024, + "step": 9922, + "task_loss": 0.3655329644680023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3146843910217285, + "epoch": 8.39, + "learning_rate": 8.955574340189724e-06, + "loss": 0.5224, + "step": 9923, + "task_loss": 0.2205556333065033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.373489111661911, + "epoch": 8.39, + "learning_rate": 8.950878181647412e-06, + "loss": 0.5001, + "step": 9924, + "task_loss": 0.5447070598602295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.909636378288269, + "epoch": 8.39, + "learning_rate": 8.9461820231051e-06, + "loss": 0.7326, + "step": 9925, + "task_loss": 0.8775274157524109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4476146399974823, + "epoch": 8.39, + "learning_rate": 8.941485864562787e-06, + "loss": 0.5477, + "step": 9926, + "task_loss": 0.972174346446991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5034589767456055, + "epoch": 8.39, + "learning_rate": 8.936789706020475e-06, + "loss": 0.6389, + "step": 9927, + "task_loss": 0.5106055736541748 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43141257762908936, + "epoch": 8.39, + "learning_rate": 8.932093547478164e-06, + "loss": 0.5481, + "step": 9928, + "task_loss": 0.8714046478271484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4261969327926636, + "epoch": 8.39, + "learning_rate": 8.92739738893585e-06, + "loss": 0.5818, + "step": 9929, + "task_loss": 0.08750349283218384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7194413542747498, + "epoch": 8.39, + "learning_rate": 8.922701230393539e-06, + "loss": 0.5487, + "step": 9930, + "task_loss": 0.5924075245857239 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6696881651878357, + "epoch": 8.39, + "learning_rate": 8.918005071851227e-06, + "loss": 0.5661, + "step": 9931, + "task_loss": 0.5555426478385925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5446509122848511, + "epoch": 8.4, + "learning_rate": 8.913308913308913e-06, + "loss": 0.6505, + "step": 9932, + "task_loss": 0.372895210981369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4515734314918518, + "epoch": 8.4, + "learning_rate": 8.908612754766601e-06, + "loss": 0.5423, + "step": 9933, + "task_loss": 0.5833816528320312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2785947918891907, + "epoch": 8.4, + "learning_rate": 8.90391659622429e-06, + "loss": 0.5032, + "step": 9934, + "task_loss": 0.4557320475578308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30781638622283936, + "epoch": 8.4, + "learning_rate": 8.899220437681977e-06, + "loss": 0.466, + "step": 9935, + "task_loss": 0.4561280906200409 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29733866453170776, + "epoch": 8.4, + "learning_rate": 8.894524279139664e-06, + "loss": 0.4854, + "step": 9936, + "task_loss": 0.12015135586261749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46201521158218384, + "epoch": 8.4, + "learning_rate": 8.889828120597352e-06, + "loss": 0.4627, + "step": 9937, + "task_loss": 0.6367649435997009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6993468403816223, + "epoch": 8.4, + "learning_rate": 8.88513196205504e-06, + "loss": 0.5773, + "step": 9938, + "task_loss": 0.6155622601509094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4917026460170746, + "epoch": 8.4, + "learning_rate": 8.880435803512726e-06, + "loss": 0.5569, + "step": 9939, + "task_loss": 0.34616437554359436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5455585718154907, + "epoch": 8.4, + "learning_rate": 8.875739644970414e-06, + "loss": 0.3921, + "step": 9940, + "task_loss": 0.39848724007606506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5908718109130859, + "epoch": 8.4, + "learning_rate": 8.871043486428102e-06, + "loss": 0.4618, + "step": 9941, + "task_loss": 0.7051546573638916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38425591588020325, + "epoch": 8.4, + "learning_rate": 8.866347327885788e-06, + "loss": 0.623, + "step": 9942, + "task_loss": 0.6424261927604675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4911694824695587, + "epoch": 8.4, + "learning_rate": 8.861651169343478e-06, + "loss": 0.5353, + "step": 9943, + "task_loss": 0.3585168421268463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2849941551685333, + "epoch": 8.41, + "learning_rate": 8.856955010801166e-06, + "loss": 0.4479, + "step": 9944, + "task_loss": 0.10542943328619003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47986310720443726, + "epoch": 8.41, + "learning_rate": 8.852258852258853e-06, + "loss": 0.4976, + "step": 9945, + "task_loss": 0.374702513217926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5915270447731018, + "epoch": 8.41, + "learning_rate": 8.84756269371654e-06, + "loss": 0.7646, + "step": 9946, + "task_loss": 0.6874126195907593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5676836371421814, + "epoch": 8.41, + "learning_rate": 8.842866535174229e-06, + "loss": 0.6468, + "step": 9947, + "task_loss": 0.8975026607513428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4295034408569336, + "epoch": 8.41, + "learning_rate": 8.838170376631915e-06, + "loss": 0.7204, + "step": 9948, + "task_loss": 0.3749655783176422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5758320689201355, + "epoch": 8.41, + "learning_rate": 8.833474218089603e-06, + "loss": 0.5394, + "step": 9949, + "task_loss": 1.805088758468628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32513272762298584, + "epoch": 8.41, + "learning_rate": 8.828778059547291e-06, + "loss": 0.4092, + "step": 9950, + "task_loss": 0.19630150496959686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9707795977592468, + "epoch": 8.41, + "learning_rate": 8.824081901004979e-06, + "loss": 0.706, + "step": 9951, + "task_loss": 0.9660483002662659 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6217832565307617, + "epoch": 8.41, + "learning_rate": 8.819385742462665e-06, + "loss": 0.4895, + "step": 9952, + "task_loss": 0.42119812965393066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3025924265384674, + "epoch": 8.41, + "learning_rate": 8.814689583920353e-06, + "loss": 0.4691, + "step": 9953, + "task_loss": 0.5430800914764404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5399686098098755, + "epoch": 8.41, + "learning_rate": 8.809993425378042e-06, + "loss": 0.5621, + "step": 9954, + "task_loss": 1.139549970626831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2663201689720154, + "epoch": 8.41, + "learning_rate": 8.805297266835728e-06, + "loss": 0.5196, + "step": 9955, + "task_loss": 0.9680528044700623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6568381786346436, + "epoch": 8.42, + "learning_rate": 8.800601108293416e-06, + "loss": 0.6026, + "step": 9956, + "task_loss": 1.3204718828201294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5357798337936401, + "epoch": 8.42, + "learning_rate": 8.795904949751104e-06, + "loss": 0.4536, + "step": 9957, + "task_loss": 0.42011573910713196 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3213920593261719, + "epoch": 8.42, + "learning_rate": 8.791208791208792e-06, + "loss": 0.4888, + "step": 9958, + "task_loss": 0.9737111926078796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5421769618988037, + "epoch": 8.42, + "learning_rate": 8.78651263266648e-06, + "loss": 0.5433, + "step": 9959, + "task_loss": 0.9650179743766785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.20753741264343262, + "epoch": 8.42, + "learning_rate": 8.781816474124168e-06, + "loss": 0.4986, + "step": 9960, + "task_loss": 0.39006805419921875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5970101356506348, + "epoch": 8.42, + "learning_rate": 8.777120315581854e-06, + "loss": 0.5135, + "step": 9961, + "task_loss": 0.3009363114833832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6671257019042969, + "epoch": 8.42, + "learning_rate": 8.772424157039542e-06, + "loss": 0.6101, + "step": 9962, + "task_loss": 1.0379154682159424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8711392879486084, + "epoch": 8.42, + "learning_rate": 8.76772799849723e-06, + "loss": 0.5779, + "step": 9963, + "task_loss": 1.4534447193145752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.763811469078064, + "epoch": 8.42, + "learning_rate": 8.763031839954917e-06, + "loss": 0.8156, + "step": 9964, + "task_loss": 1.2084267139434814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3617333769798279, + "epoch": 8.42, + "learning_rate": 8.758335681412605e-06, + "loss": 0.5307, + "step": 9965, + "task_loss": 0.9749069809913635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7513591647148132, + "epoch": 8.42, + "learning_rate": 8.753639522870293e-06, + "loss": 0.6691, + "step": 9966, + "task_loss": 0.6057542562484741 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.661767840385437, + "epoch": 8.42, + "learning_rate": 8.74894336432798e-06, + "loss": 0.6703, + "step": 9967, + "task_loss": 0.678107738494873 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5381221175193787, + "epoch": 8.43, + "learning_rate": 8.744247205785667e-06, + "loss": 0.7359, + "step": 9968, + "task_loss": 1.9459682703018188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6304236650466919, + "epoch": 8.43, + "learning_rate": 8.739551047243355e-06, + "loss": 0.4902, + "step": 9969, + "task_loss": 0.3533030152320862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.367307186126709, + "epoch": 8.43, + "learning_rate": 8.734854888701043e-06, + "loss": 0.5298, + "step": 9970, + "task_loss": 1.1322321891784668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27767831087112427, + "epoch": 8.43, + "learning_rate": 8.73015873015873e-06, + "loss": 0.4663, + "step": 9971, + "task_loss": 0.13401243090629578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3136594891548157, + "epoch": 8.43, + "learning_rate": 8.725462571616418e-06, + "loss": 0.5118, + "step": 9972, + "task_loss": 0.5296337008476257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5264785885810852, + "epoch": 8.43, + "learning_rate": 8.720766413074106e-06, + "loss": 0.6614, + "step": 9973, + "task_loss": 0.9530460238456726 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5578907132148743, + "epoch": 8.43, + "learning_rate": 8.716070254531794e-06, + "loss": 0.5565, + "step": 9974, + "task_loss": 1.0055028200149536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5714325904846191, + "epoch": 8.43, + "learning_rate": 8.711374095989482e-06, + "loss": 0.5785, + "step": 9975, + "task_loss": 0.47915226221084595 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49294513463974, + "epoch": 8.43, + "learning_rate": 8.70667793744717e-06, + "loss": 0.5424, + "step": 9976, + "task_loss": 0.6642913818359375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6583653688430786, + "epoch": 8.43, + "learning_rate": 8.701981778904856e-06, + "loss": 0.6671, + "step": 9977, + "task_loss": 1.0205702781677246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7613810300827026, + "epoch": 8.43, + "learning_rate": 8.697285620362544e-06, + "loss": 0.5539, + "step": 9978, + "task_loss": 1.031827688217163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5802471041679382, + "epoch": 8.44, + "learning_rate": 8.692589461820232e-06, + "loss": 0.6252, + "step": 9979, + "task_loss": 1.8595950603485107 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7182286977767944, + "epoch": 8.44, + "learning_rate": 8.687893303277919e-06, + "loss": 0.7738, + "step": 9980, + "task_loss": 1.348763108253479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42168959975242615, + "epoch": 8.44, + "learning_rate": 8.683197144735607e-06, + "loss": 0.4288, + "step": 9981, + "task_loss": 0.4145384132862091 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6082565784454346, + "epoch": 8.44, + "learning_rate": 8.678500986193295e-06, + "loss": 0.6529, + "step": 9982, + "task_loss": 1.3244729042053223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29717817902565, + "epoch": 8.44, + "learning_rate": 8.673804827650981e-06, + "loss": 0.4815, + "step": 9983, + "task_loss": 0.387093186378479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6100088357925415, + "epoch": 8.44, + "learning_rate": 8.669108669108669e-06, + "loss": 0.6398, + "step": 9984, + "task_loss": 0.9132628440856934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47162073850631714, + "epoch": 8.44, + "learning_rate": 8.664412510566357e-06, + "loss": 0.6393, + "step": 9985, + "task_loss": 0.19711558520793915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5330845713615417, + "epoch": 8.44, + "learning_rate": 8.659716352024045e-06, + "loss": 0.5794, + "step": 9986, + "task_loss": 0.6849272847175598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8227648735046387, + "epoch": 8.44, + "learning_rate": 8.655020193481731e-06, + "loss": 0.6614, + "step": 9987, + "task_loss": 1.1872047185897827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5226609706878662, + "epoch": 8.44, + "learning_rate": 8.65032403493942e-06, + "loss": 0.6185, + "step": 9988, + "task_loss": 1.0937620401382446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42052704095840454, + "epoch": 8.44, + "learning_rate": 8.645627876397107e-06, + "loss": 0.4709, + "step": 9989, + "task_loss": 0.5768736004829407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5364763140678406, + "epoch": 8.44, + "learning_rate": 8.640931717854795e-06, + "loss": 0.5172, + "step": 9990, + "task_loss": 0.8856743574142456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7680165767669678, + "epoch": 8.45, + "learning_rate": 8.636235559312484e-06, + "loss": 0.6657, + "step": 9991, + "task_loss": 0.682752251625061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9503989219665527, + "epoch": 8.45, + "learning_rate": 8.631539400770172e-06, + "loss": 0.6391, + "step": 9992, + "task_loss": 0.8580178022384644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2980901300907135, + "epoch": 8.45, + "learning_rate": 8.626843242227858e-06, + "loss": 0.5618, + "step": 9993, + "task_loss": 0.16682946681976318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6299120187759399, + "epoch": 8.45, + "learning_rate": 8.622147083685546e-06, + "loss": 0.5697, + "step": 9994, + "task_loss": 0.4846227765083313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7331041097640991, + "epoch": 8.45, + "learning_rate": 8.617450925143234e-06, + "loss": 0.5717, + "step": 9995, + "task_loss": 0.7894541621208191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2859881818294525, + "epoch": 8.45, + "learning_rate": 8.61275476660092e-06, + "loss": 0.5429, + "step": 9996, + "task_loss": 0.28261926770210266 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5740426778793335, + "epoch": 8.45, + "learning_rate": 8.608058608058608e-06, + "loss": 0.7179, + "step": 9997, + "task_loss": 1.1668803691864014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3266582190990448, + "epoch": 8.45, + "learning_rate": 8.603362449516296e-06, + "loss": 0.4764, + "step": 9998, + "task_loss": 0.42472049593925476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9344483613967896, + "epoch": 8.45, + "learning_rate": 8.598666290973983e-06, + "loss": 0.5965, + "step": 9999, + "task_loss": 1.1691269874572754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4198623299598694, + "epoch": 8.45, + "learning_rate": 8.59397013243167e-06, + "loss": 0.4921, + "step": 10000, + "task_loss": 0.4757460951805115 + }, + { + "epoch": 8.45, + "eval_accuracy": 0.9025742574257426, + "eval_loss": 0.37651845812797546, + "eval_runtime": 226.1659, + "eval_samples_per_second": 111.644, + "eval_steps_per_second": 0.875, + "step": 10000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3017149567604065, + "epoch": 8.45, + "learning_rate": 8.589273973889359e-06, + "loss": 0.4227, + "step": 10001, + "task_loss": 0.7430132627487183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5091423392295837, + "epoch": 8.45, + "learning_rate": 8.584577815347047e-06, + "loss": 0.4842, + "step": 10002, + "task_loss": 0.7173858880996704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38988274335861206, + "epoch": 8.46, + "learning_rate": 8.579881656804733e-06, + "loss": 0.4981, + "step": 10003, + "task_loss": 0.5944151878356934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.348987340927124, + "epoch": 8.46, + "learning_rate": 8.575185498262421e-06, + "loss": 0.5361, + "step": 10004, + "task_loss": 0.27893880009651184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5960767269134521, + "epoch": 8.46, + "learning_rate": 8.57048933972011e-06, + "loss": 0.6611, + "step": 10005, + "task_loss": 0.3596271574497223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45892226696014404, + "epoch": 8.46, + "learning_rate": 8.565793181177797e-06, + "loss": 0.4822, + "step": 10006, + "task_loss": 0.7849210500717163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7698813080787659, + "epoch": 8.46, + "learning_rate": 8.561097022635485e-06, + "loss": 0.7169, + "step": 10007, + "task_loss": 0.26602986454963684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46376562118530273, + "epoch": 8.46, + "learning_rate": 8.556400864093173e-06, + "loss": 0.5563, + "step": 10008, + "task_loss": 0.5032181739807129 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43545717000961304, + "epoch": 8.46, + "learning_rate": 8.55170470555086e-06, + "loss": 0.4423, + "step": 10009, + "task_loss": 0.3387356698513031 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4640302062034607, + "epoch": 8.46, + "learning_rate": 8.547008547008548e-06, + "loss": 0.5529, + "step": 10010, + "task_loss": 0.6506044268608093 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5258544683456421, + "epoch": 8.46, + "learning_rate": 8.542312388466236e-06, + "loss": 0.6224, + "step": 10011, + "task_loss": 0.5733548402786255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9912153482437134, + "epoch": 8.46, + "learning_rate": 8.537616229923922e-06, + "loss": 0.7333, + "step": 10012, + "task_loss": 1.0554307699203491 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5356686115264893, + "epoch": 8.46, + "learning_rate": 8.53292007138161e-06, + "loss": 0.4813, + "step": 10013, + "task_loss": 1.0235570669174194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47651442885398865, + "epoch": 8.46, + "learning_rate": 8.528223912839298e-06, + "loss": 0.4859, + "step": 10014, + "task_loss": 1.3161373138427734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35693198442459106, + "epoch": 8.47, + "learning_rate": 8.523527754296984e-06, + "loss": 0.5586, + "step": 10015, + "task_loss": 0.4627190828323364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8183948993682861, + "epoch": 8.47, + "learning_rate": 8.518831595754672e-06, + "loss": 0.6027, + "step": 10016, + "task_loss": 0.2677037715911865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5472632646560669, + "epoch": 8.47, + "learning_rate": 8.51413543721236e-06, + "loss": 0.6024, + "step": 10017, + "task_loss": 0.48591431975364685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.630576491355896, + "epoch": 8.47, + "learning_rate": 8.509439278670049e-06, + "loss": 0.6012, + "step": 10018, + "task_loss": 0.442399263381958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33686092495918274, + "epoch": 8.47, + "learning_rate": 8.504743120127735e-06, + "loss": 0.4132, + "step": 10019, + "task_loss": 0.2704259157180786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.617949903011322, + "epoch": 8.47, + "learning_rate": 8.500046961585425e-06, + "loss": 0.636, + "step": 10020, + "task_loss": 0.15515285730361938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6718248724937439, + "epoch": 8.47, + "learning_rate": 8.495350803043111e-06, + "loss": 0.4546, + "step": 10021, + "task_loss": 0.28392261266708374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31976184248924255, + "epoch": 8.47, + "learning_rate": 8.490654644500799e-06, + "loss": 0.5125, + "step": 10022, + "task_loss": 0.6169281005859375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7102287411689758, + "epoch": 8.47, + "learning_rate": 8.485958485958487e-06, + "loss": 0.5228, + "step": 10023, + "task_loss": 1.1593163013458252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4569820165634155, + "epoch": 8.47, + "learning_rate": 8.481262327416175e-06, + "loss": 0.7145, + "step": 10024, + "task_loss": 0.6015130877494812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4448516368865967, + "epoch": 8.47, + "learning_rate": 8.476566168873861e-06, + "loss": 0.5036, + "step": 10025, + "task_loss": 0.8866956233978271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9973589777946472, + "epoch": 8.47, + "learning_rate": 8.47187001033155e-06, + "loss": 0.6455, + "step": 10026, + "task_loss": 0.42801979184150696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6565727591514587, + "epoch": 8.48, + "learning_rate": 8.467173851789237e-06, + "loss": 0.575, + "step": 10027, + "task_loss": 0.7306531667709351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.520441472530365, + "epoch": 8.48, + "learning_rate": 8.462477693246924e-06, + "loss": 0.5139, + "step": 10028, + "task_loss": 0.6556795239448547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6460514068603516, + "epoch": 8.48, + "learning_rate": 8.457781534704612e-06, + "loss": 0.7458, + "step": 10029, + "task_loss": 0.8729152679443359 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4478207230567932, + "epoch": 8.48, + "learning_rate": 8.4530853761623e-06, + "loss": 0.5118, + "step": 10030, + "task_loss": 0.461703360080719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7045949697494507, + "epoch": 8.48, + "learning_rate": 8.448389217619986e-06, + "loss": 0.5852, + "step": 10031, + "task_loss": 1.1685397624969482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43410611152648926, + "epoch": 8.48, + "learning_rate": 8.443693059077674e-06, + "loss": 0.5269, + "step": 10032, + "task_loss": 0.7885587811470032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7907038331031799, + "epoch": 8.48, + "learning_rate": 8.438996900535362e-06, + "loss": 0.6419, + "step": 10033, + "task_loss": 0.4274558424949646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31654179096221924, + "epoch": 8.48, + "learning_rate": 8.43430074199305e-06, + "loss": 0.578, + "step": 10034, + "task_loss": 0.5383250713348389 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44962847232818604, + "epoch": 8.48, + "learning_rate": 8.429604583450738e-06, + "loss": 0.6329, + "step": 10035, + "task_loss": 0.593730628490448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5202362537384033, + "epoch": 8.48, + "learning_rate": 8.424908424908426e-06, + "loss": 0.4515, + "step": 10036, + "task_loss": 0.7812157869338989 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4264167547225952, + "epoch": 8.48, + "learning_rate": 8.420212266366113e-06, + "loss": 0.5212, + "step": 10037, + "task_loss": 0.8531744480133057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9380546808242798, + "epoch": 8.48, + "learning_rate": 8.4155161078238e-06, + "loss": 0.6167, + "step": 10038, + "task_loss": 0.6545056700706482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6099013090133667, + "epoch": 8.49, + "learning_rate": 8.410819949281489e-06, + "loss": 0.529, + "step": 10039, + "task_loss": 1.4963222742080688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6342105269432068, + "epoch": 8.49, + "learning_rate": 8.406123790739177e-06, + "loss": 0.5504, + "step": 10040, + "task_loss": 0.5106306672096252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5394735932350159, + "epoch": 8.49, + "learning_rate": 8.401427632196863e-06, + "loss": 0.5927, + "step": 10041, + "task_loss": 0.3521052300930023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.942297101020813, + "epoch": 8.49, + "learning_rate": 8.396731473654551e-06, + "loss": 0.752, + "step": 10042, + "task_loss": 1.205306053161621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5761298537254333, + "epoch": 8.49, + "learning_rate": 8.39203531511224e-06, + "loss": 0.6953, + "step": 10043, + "task_loss": 1.1925389766693115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6496156454086304, + "epoch": 8.49, + "learning_rate": 8.387339156569926e-06, + "loss": 0.5735, + "step": 10044, + "task_loss": 0.939959704875946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9007594585418701, + "epoch": 8.49, + "learning_rate": 8.382642998027614e-06, + "loss": 0.767, + "step": 10045, + "task_loss": 0.8220113515853882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7251944541931152, + "epoch": 8.49, + "learning_rate": 8.377946839485302e-06, + "loss": 0.5073, + "step": 10046, + "task_loss": 1.5946019887924194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9293487071990967, + "epoch": 8.49, + "learning_rate": 8.373250680942988e-06, + "loss": 0.5007, + "step": 10047, + "task_loss": 0.633931577205658 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5057868957519531, + "epoch": 8.49, + "learning_rate": 8.368554522400676e-06, + "loss": 0.5622, + "step": 10048, + "task_loss": 0.7661393284797668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.378428190946579, + "epoch": 8.49, + "learning_rate": 8.363858363858364e-06, + "loss": 0.5257, + "step": 10049, + "task_loss": 0.279744952917099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7516874074935913, + "epoch": 8.5, + "learning_rate": 8.35916220531605e-06, + "loss": 0.6709, + "step": 10050, + "task_loss": 0.2894376218318939 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40842193365097046, + "epoch": 8.5, + "learning_rate": 8.35446604677374e-06, + "loss": 0.4575, + "step": 10051, + "task_loss": 0.13369153439998627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6151243448257446, + "epoch": 8.5, + "learning_rate": 8.349769888231428e-06, + "loss": 0.6891, + "step": 10052, + "task_loss": 0.6247830986976624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5475522875785828, + "epoch": 8.5, + "learning_rate": 8.345073729689114e-06, + "loss": 0.5901, + "step": 10053, + "task_loss": 1.1218715906143188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7991716861724854, + "epoch": 8.5, + "learning_rate": 8.340377571146803e-06, + "loss": 0.6654, + "step": 10054, + "task_loss": 0.6498648524284363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7038369178771973, + "epoch": 8.5, + "learning_rate": 8.33568141260449e-06, + "loss": 0.5413, + "step": 10055, + "task_loss": 0.2541882395744324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8381092548370361, + "epoch": 8.5, + "learning_rate": 8.330985254062177e-06, + "loss": 0.6499, + "step": 10056, + "task_loss": 1.004915714263916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5427372455596924, + "epoch": 8.5, + "learning_rate": 8.326289095519865e-06, + "loss": 0.5748, + "step": 10057, + "task_loss": 1.2405643463134766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45553454756736755, + "epoch": 8.5, + "learning_rate": 8.321592936977553e-06, + "loss": 0.6636, + "step": 10058, + "task_loss": 0.754409909248352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5653579235076904, + "epoch": 8.5, + "learning_rate": 8.316896778435241e-06, + "loss": 0.508, + "step": 10059, + "task_loss": 1.033448576927185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.740765392780304, + "epoch": 8.5, + "learning_rate": 8.312200619892927e-06, + "loss": 0.5506, + "step": 10060, + "task_loss": 0.3072812259197235 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5614935159683228, + "epoch": 8.5, + "learning_rate": 8.307504461350615e-06, + "loss": 0.686, + "step": 10061, + "task_loss": 1.119915246963501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4209464192390442, + "epoch": 8.51, + "learning_rate": 8.302808302808303e-06, + "loss": 0.4828, + "step": 10062, + "task_loss": 0.6324139833450317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4343408942222595, + "epoch": 8.51, + "learning_rate": 8.29811214426599e-06, + "loss": 0.4699, + "step": 10063, + "task_loss": 0.14182300865650177 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44105952978134155, + "epoch": 8.51, + "learning_rate": 8.293415985723678e-06, + "loss": 0.5473, + "step": 10064, + "task_loss": 0.4118945598602295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8978383541107178, + "epoch": 8.51, + "learning_rate": 8.288719827181366e-06, + "loss": 0.6892, + "step": 10065, + "task_loss": 1.1369953155517578 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27194952964782715, + "epoch": 8.51, + "learning_rate": 8.284023668639054e-06, + "loss": 0.6357, + "step": 10066, + "task_loss": 0.3206271827220917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4990245997905731, + "epoch": 8.51, + "learning_rate": 8.279327510096742e-06, + "loss": 0.7337, + "step": 10067, + "task_loss": 0.34128084778785706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.512772798538208, + "epoch": 8.51, + "learning_rate": 8.27463135155443e-06, + "loss": 0.6719, + "step": 10068, + "task_loss": 0.600698709487915 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31321442127227783, + "epoch": 8.51, + "learning_rate": 8.269935193012116e-06, + "loss": 0.4469, + "step": 10069, + "task_loss": 0.5037088394165039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3567098081111908, + "epoch": 8.51, + "learning_rate": 8.265239034469804e-06, + "loss": 0.452, + "step": 10070, + "task_loss": 0.938822329044342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4847944378852844, + "epoch": 8.51, + "learning_rate": 8.260542875927492e-06, + "loss": 0.5789, + "step": 10071, + "task_loss": 0.44490042328834534 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5847632884979248, + "epoch": 8.51, + "learning_rate": 8.255846717385179e-06, + "loss": 0.5013, + "step": 10072, + "task_loss": 0.9204127788543701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9299798011779785, + "epoch": 8.51, + "learning_rate": 8.251150558842867e-06, + "loss": 0.598, + "step": 10073, + "task_loss": 0.9591830372810364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6519808769226074, + "epoch": 8.52, + "learning_rate": 8.246454400300555e-06, + "loss": 0.6216, + "step": 10074, + "task_loss": 0.8872933387756348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3964068293571472, + "epoch": 8.52, + "learning_rate": 8.241758241758243e-06, + "loss": 0.5555, + "step": 10075, + "task_loss": 0.7764999866485596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4139261841773987, + "epoch": 8.52, + "learning_rate": 8.237062083215929e-06, + "loss": 0.699, + "step": 10076, + "task_loss": 0.8838322162628174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9584897756576538, + "epoch": 8.52, + "learning_rate": 8.232365924673617e-06, + "loss": 0.7196, + "step": 10077, + "task_loss": 0.9143103957176208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5160485506057739, + "epoch": 8.52, + "learning_rate": 8.227669766131305e-06, + "loss": 0.5718, + "step": 10078, + "task_loss": 0.2763226330280304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0263718366622925, + "epoch": 8.52, + "learning_rate": 8.222973607588992e-06, + "loss": 0.8316, + "step": 10079, + "task_loss": 0.9081021547317505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7080621719360352, + "epoch": 8.52, + "learning_rate": 8.21827744904668e-06, + "loss": 0.4945, + "step": 10080, + "task_loss": 1.4055330753326416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6522591710090637, + "epoch": 8.52, + "learning_rate": 8.213581290504368e-06, + "loss": 0.5695, + "step": 10081, + "task_loss": 0.3538375496864319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3671731948852539, + "epoch": 8.52, + "learning_rate": 8.208885131962056e-06, + "loss": 0.6089, + "step": 10082, + "task_loss": 0.9851669073104858 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5110629200935364, + "epoch": 8.52, + "learning_rate": 8.204188973419744e-06, + "loss": 0.706, + "step": 10083, + "task_loss": 0.8025466203689575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6208304166793823, + "epoch": 8.52, + "learning_rate": 8.199492814877432e-06, + "loss": 0.7029, + "step": 10084, + "task_loss": 0.7408548593521118 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5281844735145569, + "epoch": 8.52, + "learning_rate": 8.194796656335118e-06, + "loss": 0.6207, + "step": 10085, + "task_loss": 0.10437336564064026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7888798713684082, + "epoch": 8.53, + "learning_rate": 8.190100497792806e-06, + "loss": 0.7803, + "step": 10086, + "task_loss": 1.1193461418151855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41526296734809875, + "epoch": 8.53, + "learning_rate": 8.185404339250494e-06, + "loss": 0.6, + "step": 10087, + "task_loss": 0.5470412373542786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6257859468460083, + "epoch": 8.53, + "learning_rate": 8.18070818070818e-06, + "loss": 0.5082, + "step": 10088, + "task_loss": 1.1351027488708496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3701359033584595, + "epoch": 8.53, + "learning_rate": 8.176012022165868e-06, + "loss": 0.4464, + "step": 10089, + "task_loss": 1.101163387298584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5960867404937744, + "epoch": 8.53, + "learning_rate": 8.171315863623556e-06, + "loss": 0.5912, + "step": 10090, + "task_loss": 0.7456415891647339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4089732766151428, + "epoch": 8.53, + "learning_rate": 8.166619705081245e-06, + "loss": 0.5515, + "step": 10091, + "task_loss": 1.0943807363510132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5639589428901672, + "epoch": 8.53, + "learning_rate": 8.161923546538931e-06, + "loss": 0.6531, + "step": 10092, + "task_loss": 0.4710702896118164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7174573540687561, + "epoch": 8.53, + "learning_rate": 8.157227387996619e-06, + "loss": 0.4588, + "step": 10093, + "task_loss": 0.32601064443588257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30598363280296326, + "epoch": 8.53, + "learning_rate": 8.152531229454307e-06, + "loss": 0.5002, + "step": 10094, + "task_loss": 0.17186155915260315 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42604756355285645, + "epoch": 8.53, + "learning_rate": 8.147835070911993e-06, + "loss": 0.4912, + "step": 10095, + "task_loss": 0.2708716094493866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5390046238899231, + "epoch": 8.53, + "learning_rate": 8.143138912369681e-06, + "loss": 0.5562, + "step": 10096, + "task_loss": 0.38354775309562683 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43524056673049927, + "epoch": 8.53, + "learning_rate": 8.138442753827371e-06, + "loss": 0.5257, + "step": 10097, + "task_loss": 0.6912310719490051 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7595468163490295, + "epoch": 8.54, + "learning_rate": 8.133746595285057e-06, + "loss": 0.4796, + "step": 10098, + "task_loss": 1.2664991617202759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37137699127197266, + "epoch": 8.54, + "learning_rate": 8.129050436742745e-06, + "loss": 0.5173, + "step": 10099, + "task_loss": 0.7199193835258484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3278194069862366, + "epoch": 8.54, + "learning_rate": 8.124354278200433e-06, + "loss": 0.3952, + "step": 10100, + "task_loss": 0.10992012172937393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.594543993473053, + "epoch": 8.54, + "learning_rate": 8.11965811965812e-06, + "loss": 0.5559, + "step": 10101, + "task_loss": 0.48877090215682983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4837808609008789, + "epoch": 8.54, + "learning_rate": 8.114961961115808e-06, + "loss": 0.5318, + "step": 10102, + "task_loss": 0.9113153219223022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32818490266799927, + "epoch": 8.54, + "learning_rate": 8.110265802573496e-06, + "loss": 0.6883, + "step": 10103, + "task_loss": 0.29340091347694397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38841214776039124, + "epoch": 8.54, + "learning_rate": 8.105569644031182e-06, + "loss": 0.5422, + "step": 10104, + "task_loss": 0.9321341514587402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5602864027023315, + "epoch": 8.54, + "learning_rate": 8.10087348548887e-06, + "loss": 0.5682, + "step": 10105, + "task_loss": 0.8022961020469666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25351881980895996, + "epoch": 8.54, + "learning_rate": 8.096177326946558e-06, + "loss": 0.5816, + "step": 10106, + "task_loss": 0.11937905102968216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7187318801879883, + "epoch": 8.54, + "learning_rate": 8.091481168404246e-06, + "loss": 0.7374, + "step": 10107, + "task_loss": 0.9248560667037964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.833742618560791, + "epoch": 8.54, + "learning_rate": 8.086785009861933e-06, + "loss": 0.4526, + "step": 10108, + "task_loss": 0.6479013562202454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6749410033226013, + "epoch": 8.54, + "learning_rate": 8.08208885131962e-06, + "loss": 0.6015, + "step": 10109, + "task_loss": 0.16060172021389008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.518153727054596, + "epoch": 8.55, + "learning_rate": 8.077392692777309e-06, + "loss": 0.6341, + "step": 10110, + "task_loss": 0.3851993680000305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26090601086616516, + "epoch": 8.55, + "learning_rate": 8.072696534234995e-06, + "loss": 0.441, + "step": 10111, + "task_loss": 0.5293717980384827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5353056192398071, + "epoch": 8.55, + "learning_rate": 8.068000375692685e-06, + "loss": 0.4437, + "step": 10112, + "task_loss": 0.6285352110862732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4880635142326355, + "epoch": 8.55, + "learning_rate": 8.063304217150373e-06, + "loss": 0.5068, + "step": 10113, + "task_loss": 0.9870822429656982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5263556241989136, + "epoch": 8.55, + "learning_rate": 8.058608058608059e-06, + "loss": 0.6587, + "step": 10114, + "task_loss": 1.6000876426696777 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7022629976272583, + "epoch": 8.55, + "learning_rate": 8.053911900065747e-06, + "loss": 0.5666, + "step": 10115, + "task_loss": 0.804175078868866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5355299711227417, + "epoch": 8.55, + "learning_rate": 8.049215741523435e-06, + "loss": 0.6895, + "step": 10116, + "task_loss": 1.548316478729248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41749125719070435, + "epoch": 8.55, + "learning_rate": 8.044519582981122e-06, + "loss": 0.6628, + "step": 10117, + "task_loss": 2.3394904136657715 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37052983045578003, + "epoch": 8.55, + "learning_rate": 8.03982342443881e-06, + "loss": 0.4131, + "step": 10118, + "task_loss": 0.9963799715042114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45209747552871704, + "epoch": 8.55, + "learning_rate": 8.035127265896498e-06, + "loss": 0.5505, + "step": 10119, + "task_loss": 0.5448580980300903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.28125521540641785, + "epoch": 8.55, + "learning_rate": 8.030431107354184e-06, + "loss": 0.4333, + "step": 10120, + "task_loss": 0.7575242519378662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5642576813697815, + "epoch": 8.56, + "learning_rate": 8.025734948811872e-06, + "loss": 0.6924, + "step": 10121, + "task_loss": 0.7886309623718262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4827713966369629, + "epoch": 8.56, + "learning_rate": 8.02103879026956e-06, + "loss": 0.4791, + "step": 10122, + "task_loss": 0.13242892920970917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49208471179008484, + "epoch": 8.56, + "learning_rate": 8.016342631727246e-06, + "loss": 0.5081, + "step": 10123, + "task_loss": 0.7585548758506775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5045320987701416, + "epoch": 8.56, + "learning_rate": 8.011646473184934e-06, + "loss": 0.4392, + "step": 10124, + "task_loss": 0.18711814284324646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27482688426971436, + "epoch": 8.56, + "learning_rate": 8.006950314642622e-06, + "loss": 0.506, + "step": 10125, + "task_loss": 0.0407571904361248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6644451022148132, + "epoch": 8.56, + "learning_rate": 8.00225415610031e-06, + "loss": 0.5756, + "step": 10126, + "task_loss": 0.9353916049003601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2864559590816498, + "epoch": 8.56, + "learning_rate": 7.997557997557997e-06, + "loss": 0.4937, + "step": 10127, + "task_loss": 0.23424577713012695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.329814612865448, + "epoch": 8.56, + "learning_rate": 7.992861839015687e-06, + "loss": 0.4879, + "step": 10128, + "task_loss": 0.12384038418531418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3870355188846588, + "epoch": 8.56, + "learning_rate": 7.988165680473373e-06, + "loss": 0.509, + "step": 10129, + "task_loss": 0.6031726598739624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.617840588092804, + "epoch": 8.56, + "learning_rate": 7.983469521931061e-06, + "loss": 0.6792, + "step": 10130, + "task_loss": 0.475353479385376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49035799503326416, + "epoch": 8.56, + "learning_rate": 7.978773363388749e-06, + "loss": 0.5407, + "step": 10131, + "task_loss": 0.2951951324939728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7206923961639404, + "epoch": 8.56, + "learning_rate": 7.974077204846437e-06, + "loss": 0.5074, + "step": 10132, + "task_loss": 1.7934777736663818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7381411790847778, + "epoch": 8.57, + "learning_rate": 7.969381046304123e-06, + "loss": 0.561, + "step": 10133, + "task_loss": 0.8269683718681335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46145227551460266, + "epoch": 8.57, + "learning_rate": 7.964684887761811e-06, + "loss": 0.6692, + "step": 10134, + "task_loss": 0.7193881273269653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6436439156532288, + "epoch": 8.57, + "learning_rate": 7.9599887292195e-06, + "loss": 0.5558, + "step": 10135, + "task_loss": 0.4468909204006195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42139285802841187, + "epoch": 8.57, + "learning_rate": 7.955292570677186e-06, + "loss": 0.5849, + "step": 10136, + "task_loss": 0.8121824860572815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.20668308436870575, + "epoch": 8.57, + "learning_rate": 7.950596412134874e-06, + "loss": 0.4875, + "step": 10137, + "task_loss": 0.1131717711687088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6257491111755371, + "epoch": 8.57, + "learning_rate": 7.945900253592562e-06, + "loss": 0.4826, + "step": 10138, + "task_loss": 0.3529435396194458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5894378423690796, + "epoch": 8.57, + "learning_rate": 7.941204095050248e-06, + "loss": 0.6008, + "step": 10139, + "task_loss": 0.9032185077667236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3426850438117981, + "epoch": 8.57, + "learning_rate": 7.936507936507936e-06, + "loss": 0.6847, + "step": 10140, + "task_loss": 0.4977544844150543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4478592574596405, + "epoch": 8.57, + "learning_rate": 7.931811777965624e-06, + "loss": 0.4915, + "step": 10141, + "task_loss": 0.621381938457489 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4900103807449341, + "epoch": 8.57, + "learning_rate": 7.927115619423312e-06, + "loss": 0.3349, + "step": 10142, + "task_loss": 0.5146083235740662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.579024076461792, + "epoch": 8.57, + "learning_rate": 7.922419460881e-06, + "loss": 0.5346, + "step": 10143, + "task_loss": 0.6735717058181763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3790475130081177, + "epoch": 8.57, + "learning_rate": 7.917723302338688e-06, + "loss": 0.4871, + "step": 10144, + "task_loss": 0.6251894235610962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5460718870162964, + "epoch": 8.58, + "learning_rate": 7.913027143796375e-06, + "loss": 0.5633, + "step": 10145, + "task_loss": 0.3767307996749878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2993364930152893, + "epoch": 8.58, + "learning_rate": 7.908330985254063e-06, + "loss": 0.4737, + "step": 10146, + "task_loss": 0.21420060098171234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9127258062362671, + "epoch": 8.58, + "learning_rate": 7.90363482671175e-06, + "loss": 0.4906, + "step": 10147, + "task_loss": 0.9015702605247498 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42848777770996094, + "epoch": 8.58, + "learning_rate": 7.898938668169439e-06, + "loss": 0.6728, + "step": 10148, + "task_loss": 0.7727674245834351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3619275987148285, + "epoch": 8.58, + "learning_rate": 7.894242509627125e-06, + "loss": 0.5521, + "step": 10149, + "task_loss": 0.8505379557609558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8534202575683594, + "epoch": 8.58, + "learning_rate": 7.889546351084813e-06, + "loss": 0.5172, + "step": 10150, + "task_loss": 1.1356754302978516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.675075113773346, + "epoch": 8.58, + "learning_rate": 7.884850192542501e-06, + "loss": 0.6709, + "step": 10151, + "task_loss": 0.39349278807640076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5910518765449524, + "epoch": 8.58, + "learning_rate": 7.880154034000187e-06, + "loss": 0.5614, + "step": 10152, + "task_loss": 0.5043654441833496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6680186986923218, + "epoch": 8.58, + "learning_rate": 7.875457875457876e-06, + "loss": 0.7377, + "step": 10153, + "task_loss": 1.233729600906372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4576263427734375, + "epoch": 8.58, + "learning_rate": 7.870761716915564e-06, + "loss": 0.5735, + "step": 10154, + "task_loss": 0.28425025939941406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6808176040649414, + "epoch": 8.58, + "learning_rate": 7.86606555837325e-06, + "loss": 0.7476, + "step": 10155, + "task_loss": 0.7247397303581238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43139755725860596, + "epoch": 8.58, + "learning_rate": 7.861369399830938e-06, + "loss": 0.502, + "step": 10156, + "task_loss": 0.44355565309524536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4331018328666687, + "epoch": 8.59, + "learning_rate": 7.856673241288626e-06, + "loss": 0.612, + "step": 10157, + "task_loss": 0.12340562045574188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4998604655265808, + "epoch": 8.59, + "learning_rate": 7.851977082746314e-06, + "loss": 0.5681, + "step": 10158, + "task_loss": 0.44427070021629333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3837619721889496, + "epoch": 8.59, + "learning_rate": 7.847280924204002e-06, + "loss": 0.5324, + "step": 10159, + "task_loss": 0.5479989647865295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38312530517578125, + "epoch": 8.59, + "learning_rate": 7.84258476566169e-06, + "loss": 0.488, + "step": 10160, + "task_loss": 0.4153583347797394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5629072785377502, + "epoch": 8.59, + "learning_rate": 7.837888607119376e-06, + "loss": 0.5462, + "step": 10161, + "task_loss": 0.45182278752326965 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6474666595458984, + "epoch": 8.59, + "learning_rate": 7.833192448577064e-06, + "loss": 0.6333, + "step": 10162, + "task_loss": 1.2355743646621704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42465245723724365, + "epoch": 8.59, + "learning_rate": 7.828496290034752e-06, + "loss": 0.5568, + "step": 10163, + "task_loss": 0.9515724778175354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6284161806106567, + "epoch": 8.59, + "learning_rate": 7.82380013149244e-06, + "loss": 0.5225, + "step": 10164, + "task_loss": 0.5514670610427856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6502808332443237, + "epoch": 8.59, + "learning_rate": 7.819103972950127e-06, + "loss": 0.533, + "step": 10165, + "task_loss": 0.7140481472015381 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3145908713340759, + "epoch": 8.59, + "learning_rate": 7.814407814407815e-06, + "loss": 0.4809, + "step": 10166, + "task_loss": 0.25902023911476135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43434396386146545, + "epoch": 8.59, + "learning_rate": 7.809711655865503e-06, + "loss": 0.434, + "step": 10167, + "task_loss": 0.42399272322654724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34219855070114136, + "epoch": 8.59, + "learning_rate": 7.80501549732319e-06, + "loss": 0.4421, + "step": 10168, + "task_loss": 0.46535056829452515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6890006065368652, + "epoch": 8.6, + "learning_rate": 7.800319338780877e-06, + "loss": 0.6663, + "step": 10169, + "task_loss": 0.7872116565704346 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6199469566345215, + "epoch": 8.6, + "learning_rate": 7.795623180238565e-06, + "loss": 0.6704, + "step": 10170, + "task_loss": 1.1878571510314941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5149555206298828, + "epoch": 8.6, + "learning_rate": 7.790927021696252e-06, + "loss": 0.5896, + "step": 10171, + "task_loss": 0.4224426746368408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.526591420173645, + "epoch": 8.6, + "learning_rate": 7.78623086315394e-06, + "loss": 0.5096, + "step": 10172, + "task_loss": 0.5713741779327393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.531842052936554, + "epoch": 8.6, + "learning_rate": 7.781534704611628e-06, + "loss": 0.6403, + "step": 10173, + "task_loss": 0.27138659358024597 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5089507699012756, + "epoch": 8.6, + "learning_rate": 7.776838546069316e-06, + "loss": 0.6328, + "step": 10174, + "task_loss": 1.2283568382263184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.28154557943344116, + "epoch": 8.6, + "learning_rate": 7.772142387527004e-06, + "loss": 0.4644, + "step": 10175, + "task_loss": 0.23503310978412628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.691821277141571, + "epoch": 8.6, + "learning_rate": 7.767446228984692e-06, + "loss": 0.6169, + "step": 10176, + "task_loss": 0.7231013178825378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8408421277999878, + "epoch": 8.6, + "learning_rate": 7.762750070442378e-06, + "loss": 0.6201, + "step": 10177, + "task_loss": 0.5247149467468262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4279100000858307, + "epoch": 8.6, + "learning_rate": 7.758053911900066e-06, + "loss": 0.5743, + "step": 10178, + "task_loss": 0.8061252236366272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4396101236343384, + "epoch": 8.6, + "learning_rate": 7.753357753357754e-06, + "loss": 0.5312, + "step": 10179, + "task_loss": 0.214432954788208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34010961651802063, + "epoch": 8.6, + "learning_rate": 7.748661594815442e-06, + "loss": 0.6092, + "step": 10180, + "task_loss": 0.014869332313537598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8323996067047119, + "epoch": 8.61, + "learning_rate": 7.743965436273129e-06, + "loss": 0.712, + "step": 10181, + "task_loss": 0.7657367587089539 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6642820835113525, + "epoch": 8.61, + "learning_rate": 7.739269277730817e-06, + "loss": 0.6302, + "step": 10182, + "task_loss": 0.9210270643234253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3710958957672119, + "epoch": 8.61, + "learning_rate": 7.734573119188505e-06, + "loss": 0.3666, + "step": 10183, + "task_loss": 0.7601845264434814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48884764313697815, + "epoch": 8.61, + "learning_rate": 7.729876960646191e-06, + "loss": 0.6154, + "step": 10184, + "task_loss": 0.5419313311576843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6243571639060974, + "epoch": 8.61, + "learning_rate": 7.725180802103879e-06, + "loss": 0.5249, + "step": 10185, + "task_loss": 1.172163724899292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5880721807479858, + "epoch": 8.61, + "learning_rate": 7.720484643561567e-06, + "loss": 0.4962, + "step": 10186, + "task_loss": 0.8736922740936279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40281474590301514, + "epoch": 8.61, + "learning_rate": 7.715788485019253e-06, + "loss": 0.5119, + "step": 10187, + "task_loss": 0.9788747429847717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4510864019393921, + "epoch": 8.61, + "learning_rate": 7.711092326476941e-06, + "loss": 0.5376, + "step": 10188, + "task_loss": 0.886990487575531 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6114735007286072, + "epoch": 8.61, + "learning_rate": 7.70639616793463e-06, + "loss": 0.5547, + "step": 10189, + "task_loss": 0.23636624217033386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.405250608921051, + "epoch": 8.61, + "learning_rate": 7.701700009392318e-06, + "loss": 0.5171, + "step": 10190, + "task_loss": 1.4366767406463623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5506914258003235, + "epoch": 8.61, + "learning_rate": 7.697003850850006e-06, + "loss": 0.4742, + "step": 10191, + "task_loss": 0.33670493960380554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45559728145599365, + "epoch": 8.61, + "learning_rate": 7.692307692307694e-06, + "loss": 0.5543, + "step": 10192, + "task_loss": 0.48883262276649475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6641194224357605, + "epoch": 8.62, + "learning_rate": 7.68761153376538e-06, + "loss": 0.5918, + "step": 10193, + "task_loss": 0.6632692217826843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46645957231521606, + "epoch": 8.62, + "learning_rate": 7.682915375223068e-06, + "loss": 0.5451, + "step": 10194, + "task_loss": 0.8866949081420898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4706856608390808, + "epoch": 8.62, + "learning_rate": 7.678219216680756e-06, + "loss": 0.6006, + "step": 10195, + "task_loss": 1.0360324382781982 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4576830267906189, + "epoch": 8.62, + "learning_rate": 7.673523058138444e-06, + "loss": 0.6444, + "step": 10196, + "task_loss": 0.6196795701980591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41327643394470215, + "epoch": 8.62, + "learning_rate": 7.66882689959613e-06, + "loss": 0.6056, + "step": 10197, + "task_loss": 0.19692403078079224 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48437443375587463, + "epoch": 8.62, + "learning_rate": 7.664130741053818e-06, + "loss": 0.4912, + "step": 10198, + "task_loss": 0.5671310424804688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4106687307357788, + "epoch": 8.62, + "learning_rate": 7.659434582511506e-06, + "loss": 0.5622, + "step": 10199, + "task_loss": 0.5250669121742249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4318590760231018, + "epoch": 8.62, + "learning_rate": 7.654738423969193e-06, + "loss": 0.6081, + "step": 10200, + "task_loss": 1.0596551895141602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.504088282585144, + "epoch": 8.62, + "learning_rate": 7.65004226542688e-06, + "loss": 0.6584, + "step": 10201, + "task_loss": 0.4361553490161896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5513283014297485, + "epoch": 8.62, + "learning_rate": 7.645346106884569e-06, + "loss": 0.6109, + "step": 10202, + "task_loss": 1.122445821762085 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5457226037979126, + "epoch": 8.62, + "learning_rate": 7.640649948342255e-06, + "loss": 0.6439, + "step": 10203, + "task_loss": 0.7230810523033142 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6957589387893677, + "epoch": 8.63, + "learning_rate": 7.635953789799943e-06, + "loss": 0.5725, + "step": 10204, + "task_loss": 1.2208600044250488 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47404998540878296, + "epoch": 8.63, + "learning_rate": 7.631257631257633e-06, + "loss": 0.655, + "step": 10205, + "task_loss": 0.7040433287620544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41922643780708313, + "epoch": 8.63, + "learning_rate": 7.62656147271532e-06, + "loss": 0.5357, + "step": 10206, + "task_loss": 1.1510061025619507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3961447775363922, + "epoch": 8.63, + "learning_rate": 7.621865314173007e-06, + "loss": 0.6358, + "step": 10207, + "task_loss": 0.3043938875198364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8081384897232056, + "epoch": 8.63, + "learning_rate": 7.6171691556306945e-06, + "loss": 0.6044, + "step": 10208, + "task_loss": 0.9309658408164978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5744142532348633, + "epoch": 8.63, + "learning_rate": 7.6124729970883825e-06, + "loss": 0.4515, + "step": 10209, + "task_loss": 1.6275269985198975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.626166820526123, + "epoch": 8.63, + "learning_rate": 7.60777683854607e-06, + "loss": 0.5905, + "step": 10210, + "task_loss": 0.8756242990493774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4888893961906433, + "epoch": 8.63, + "learning_rate": 7.603080680003758e-06, + "loss": 0.4547, + "step": 10211, + "task_loss": 1.5373225212097168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46328017115592957, + "epoch": 8.63, + "learning_rate": 7.598384521461445e-06, + "loss": 0.6491, + "step": 10212, + "task_loss": 0.7292001247406006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5929757356643677, + "epoch": 8.63, + "learning_rate": 7.593688362919132e-06, + "loss": 0.6086, + "step": 10213, + "task_loss": 0.9026742577552795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6763771176338196, + "epoch": 8.63, + "learning_rate": 7.58899220437682e-06, + "loss": 0.5799, + "step": 10214, + "task_loss": 0.40101462602615356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9702361822128296, + "epoch": 8.63, + "learning_rate": 7.584296045834507e-06, + "loss": 0.7785, + "step": 10215, + "task_loss": 1.3708608150482178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7873521447181702, + "epoch": 8.64, + "learning_rate": 7.579599887292195e-06, + "loss": 0.6442, + "step": 10216, + "task_loss": 1.1339277029037476 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.548207700252533, + "epoch": 8.64, + "learning_rate": 7.5749037287498826e-06, + "loss": 0.4793, + "step": 10217, + "task_loss": 0.3917886018753052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49768316745758057, + "epoch": 8.64, + "learning_rate": 7.57020757020757e-06, + "loss": 0.6862, + "step": 10218, + "task_loss": 0.9330788254737854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4053627848625183, + "epoch": 8.64, + "learning_rate": 7.565511411665258e-06, + "loss": 0.4753, + "step": 10219, + "task_loss": 0.6663283705711365 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6307308673858643, + "epoch": 8.64, + "learning_rate": 7.560815253122947e-06, + "loss": 0.5704, + "step": 10220, + "task_loss": 1.2533655166625977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7778764963150024, + "epoch": 8.64, + "learning_rate": 7.556119094580634e-06, + "loss": 0.5699, + "step": 10221, + "task_loss": 1.328901767730713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41368865966796875, + "epoch": 8.64, + "learning_rate": 7.551422936038322e-06, + "loss": 0.4238, + "step": 10222, + "task_loss": 0.8493715524673462 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3119378089904785, + "epoch": 8.64, + "learning_rate": 7.546726777496009e-06, + "loss": 0.5276, + "step": 10223, + "task_loss": 0.5483587384223938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43674248456954956, + "epoch": 8.64, + "learning_rate": 7.542030618953696e-06, + "loss": 0.5052, + "step": 10224, + "task_loss": 0.14912505447864532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3352706730365753, + "epoch": 8.64, + "learning_rate": 7.537334460411384e-06, + "loss": 0.5695, + "step": 10225, + "task_loss": 0.3992350399494171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6370732188224792, + "epoch": 8.64, + "learning_rate": 7.5326383018690715e-06, + "loss": 0.6808, + "step": 10226, + "task_loss": 0.11418670415878296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47148245573043823, + "epoch": 8.64, + "learning_rate": 7.5279421433267595e-06, + "loss": 0.5324, + "step": 10227, + "task_loss": 1.2116204500198364 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6872268915176392, + "epoch": 8.65, + "learning_rate": 7.523245984784447e-06, + "loss": 0.5556, + "step": 10228, + "task_loss": 0.47657063603401184 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42959070205688477, + "epoch": 8.65, + "learning_rate": 7.518549826242134e-06, + "loss": 0.6317, + "step": 10229, + "task_loss": 0.5774355530738831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6065131425857544, + "epoch": 8.65, + "learning_rate": 7.513853667699822e-06, + "loss": 0.6498, + "step": 10230, + "task_loss": 1.0525391101837158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3681030869483948, + "epoch": 8.65, + "learning_rate": 7.509157509157509e-06, + "loss": 0.561, + "step": 10231, + "task_loss": 0.3069378137588501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3902667164802551, + "epoch": 8.65, + "learning_rate": 7.504461350615197e-06, + "loss": 0.5837, + "step": 10232, + "task_loss": 0.3778567910194397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8332396149635315, + "epoch": 8.65, + "learning_rate": 7.499765192072884e-06, + "loss": 0.4939, + "step": 10233, + "task_loss": 0.9043023586273193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4237216114997864, + "epoch": 8.65, + "learning_rate": 7.4950690335305715e-06, + "loss": 0.6901, + "step": 10234, + "task_loss": 0.6827937364578247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6342553496360779, + "epoch": 8.65, + "learning_rate": 7.4903728749882596e-06, + "loss": 0.6429, + "step": 10235, + "task_loss": 1.0258148908615112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39313584566116333, + "epoch": 8.65, + "learning_rate": 7.4856767164459484e-06, + "loss": 0.4252, + "step": 10236, + "task_loss": 1.0365196466445923 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25762152671813965, + "epoch": 8.65, + "learning_rate": 7.480980557903636e-06, + "loss": 0.4732, + "step": 10237, + "task_loss": 0.3036763370037079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8248552680015564, + "epoch": 8.65, + "learning_rate": 7.476284399361324e-06, + "loss": 0.6466, + "step": 10238, + "task_loss": 0.27214887738227844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8097308874130249, + "epoch": 8.65, + "learning_rate": 7.471588240819011e-06, + "loss": 0.5922, + "step": 10239, + "task_loss": 0.7543125748634338 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3951740562915802, + "epoch": 8.66, + "learning_rate": 7.466892082276698e-06, + "loss": 0.5256, + "step": 10240, + "task_loss": 1.6051669120788574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37878715991973877, + "epoch": 8.66, + "learning_rate": 7.462195923734386e-06, + "loss": 0.5728, + "step": 10241, + "task_loss": 0.06755638867616653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7129075527191162, + "epoch": 8.66, + "learning_rate": 7.457499765192073e-06, + "loss": 0.5835, + "step": 10242, + "task_loss": 0.6745591759681702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6269041299819946, + "epoch": 8.66, + "learning_rate": 7.4528036066497604e-06, + "loss": 0.5542, + "step": 10243, + "task_loss": 0.3346305787563324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1056039333343506, + "epoch": 8.66, + "learning_rate": 7.4481074481074485e-06, + "loss": 0.8226, + "step": 10244, + "task_loss": 1.3281147480010986 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5918039083480835, + "epoch": 8.66, + "learning_rate": 7.443411289565136e-06, + "loss": 0.5547, + "step": 10245, + "task_loss": 0.7364872097969055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4488946795463562, + "epoch": 8.66, + "learning_rate": 7.438715131022824e-06, + "loss": 0.5398, + "step": 10246, + "task_loss": 0.5460629463195801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45832908153533936, + "epoch": 8.66, + "learning_rate": 7.434018972480511e-06, + "loss": 0.6335, + "step": 10247, + "task_loss": 1.8058630228042603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6539885997772217, + "epoch": 8.66, + "learning_rate": 7.429322813938198e-06, + "loss": 0.6427, + "step": 10248, + "task_loss": 1.0286424160003662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3697872459888458, + "epoch": 8.66, + "learning_rate": 7.424626655395886e-06, + "loss": 0.4917, + "step": 10249, + "task_loss": 0.12179650366306305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5173530578613281, + "epoch": 8.66, + "learning_rate": 7.419930496853573e-06, + "loss": 0.6341, + "step": 10250, + "task_loss": 1.1463367938995361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4224802553653717, + "epoch": 8.66, + "learning_rate": 7.415234338311262e-06, + "loss": 0.6286, + "step": 10251, + "task_loss": 0.13439907133579254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6051065921783447, + "epoch": 8.67, + "learning_rate": 7.41053817976895e-06, + "loss": 0.69, + "step": 10252, + "task_loss": 0.7341217994689941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.409909725189209, + "epoch": 8.67, + "learning_rate": 7.405842021226637e-06, + "loss": 0.5283, + "step": 10253, + "task_loss": 1.1080448627471924 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5361477136611938, + "epoch": 8.67, + "learning_rate": 7.4011458626843246e-06, + "loss": 0.5483, + "step": 10254, + "task_loss": 0.6561297178268433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8686103820800781, + "epoch": 8.67, + "learning_rate": 7.396449704142013e-06, + "loss": 0.5504, + "step": 10255, + "task_loss": 1.025373101234436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.565653920173645, + "epoch": 8.67, + "learning_rate": 7.3917535455997e-06, + "loss": 0.519, + "step": 10256, + "task_loss": 0.6099762916564941 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41845130920410156, + "epoch": 8.67, + "learning_rate": 7.387057387057388e-06, + "loss": 0.5508, + "step": 10257, + "task_loss": 0.6322251558303833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.503254771232605, + "epoch": 8.67, + "learning_rate": 7.382361228515075e-06, + "loss": 0.5712, + "step": 10258, + "task_loss": 0.5466342568397522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42690950632095337, + "epoch": 8.67, + "learning_rate": 7.377665069972762e-06, + "loss": 0.4372, + "step": 10259, + "task_loss": 0.34407803416252136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47467920184135437, + "epoch": 8.67, + "learning_rate": 7.37296891143045e-06, + "loss": 0.5751, + "step": 10260, + "task_loss": 0.5128033757209778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7595347166061401, + "epoch": 8.67, + "learning_rate": 7.3682727528881374e-06, + "loss": 0.565, + "step": 10261, + "task_loss": 1.2179811000823975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6114000678062439, + "epoch": 8.67, + "learning_rate": 7.3635765943458255e-06, + "loss": 0.5584, + "step": 10262, + "task_loss": 0.29443392157554626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4346359074115753, + "epoch": 8.67, + "learning_rate": 7.358880435803513e-06, + "loss": 0.4824, + "step": 10263, + "task_loss": 0.5224870443344116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5443424582481384, + "epoch": 8.68, + "learning_rate": 7.3541842772612e-06, + "loss": 0.4592, + "step": 10264, + "task_loss": 0.7487624287605286 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8096593618392944, + "epoch": 8.68, + "learning_rate": 7.349488118718888e-06, + "loss": 0.617, + "step": 10265, + "task_loss": 1.1723335981369019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6420505046844482, + "epoch": 8.68, + "learning_rate": 7.344791960176575e-06, + "loss": 0.503, + "step": 10266, + "task_loss": 1.2138831615447998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8059666156768799, + "epoch": 8.68, + "learning_rate": 7.340095801634264e-06, + "loss": 0.6517, + "step": 10267, + "task_loss": 0.8919260501861572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.658506453037262, + "epoch": 8.68, + "learning_rate": 7.335399643091952e-06, + "loss": 0.5936, + "step": 10268, + "task_loss": 1.6535675525665283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3891355097293854, + "epoch": 8.68, + "learning_rate": 7.330703484549639e-06, + "loss": 0.5521, + "step": 10269, + "task_loss": 0.33551591634750366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8669257164001465, + "epoch": 8.68, + "learning_rate": 7.326007326007326e-06, + "loss": 0.5193, + "step": 10270, + "task_loss": 0.8533660173416138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47761911153793335, + "epoch": 8.68, + "learning_rate": 7.321311167465014e-06, + "loss": 0.4264, + "step": 10271, + "task_loss": 0.8564295172691345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3078145980834961, + "epoch": 8.68, + "learning_rate": 7.3166150089227016e-06, + "loss": 0.399, + "step": 10272, + "task_loss": 0.7565029263496399 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4960970878601074, + "epoch": 8.68, + "learning_rate": 7.31191885038039e-06, + "loss": 0.5368, + "step": 10273, + "task_loss": 2.1294572353363037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7660937905311584, + "epoch": 8.68, + "learning_rate": 7.307222691838077e-06, + "loss": 0.524, + "step": 10274, + "task_loss": 0.4152833819389343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3508826494216919, + "epoch": 8.69, + "learning_rate": 7.302526533295764e-06, + "loss": 0.6549, + "step": 10275, + "task_loss": 0.34853848814964294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.792838454246521, + "epoch": 8.69, + "learning_rate": 7.297830374753452e-06, + "loss": 0.4894, + "step": 10276, + "task_loss": 0.437909871339798 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7205517292022705, + "epoch": 8.69, + "learning_rate": 7.293134216211139e-06, + "loss": 0.5773, + "step": 10277, + "task_loss": 1.1230223178863525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.787097692489624, + "epoch": 8.69, + "learning_rate": 7.288438057668827e-06, + "loss": 0.5862, + "step": 10278, + "task_loss": 1.151388168334961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4683302640914917, + "epoch": 8.69, + "learning_rate": 7.283741899126514e-06, + "loss": 0.4567, + "step": 10279, + "task_loss": 0.6024013757705688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7732101082801819, + "epoch": 8.69, + "learning_rate": 7.279045740584202e-06, + "loss": 0.7833, + "step": 10280, + "task_loss": 2.0467896461486816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2833815813064575, + "epoch": 8.69, + "learning_rate": 7.27434958204189e-06, + "loss": 0.8617, + "step": 10281, + "task_loss": 1.1665875911712646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.563601016998291, + "epoch": 8.69, + "learning_rate": 7.2696534234995785e-06, + "loss": 0.5302, + "step": 10282, + "task_loss": 1.1753880977630615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3792545199394226, + "epoch": 8.69, + "learning_rate": 7.264957264957266e-06, + "loss": 0.526, + "step": 10283, + "task_loss": 0.45275241136550903 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6064555644989014, + "epoch": 8.69, + "learning_rate": 7.260261106414954e-06, + "loss": 0.4654, + "step": 10284, + "task_loss": 0.3920440077781677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43233346939086914, + "epoch": 8.69, + "learning_rate": 7.255564947872641e-06, + "loss": 0.5527, + "step": 10285, + "task_loss": 0.6501238942146301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5199381113052368, + "epoch": 8.69, + "learning_rate": 7.250868789330328e-06, + "loss": 0.501, + "step": 10286, + "task_loss": 0.22399379312992096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.504426896572113, + "epoch": 8.7, + "learning_rate": 7.246172630788016e-06, + "loss": 0.6557, + "step": 10287, + "task_loss": 0.46140217781066895 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41826102137565613, + "epoch": 8.7, + "learning_rate": 7.241476472245703e-06, + "loss": 0.5942, + "step": 10288, + "task_loss": 0.9990370869636536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6491661071777344, + "epoch": 8.7, + "learning_rate": 7.236780313703391e-06, + "loss": 0.6584, + "step": 10289, + "task_loss": 0.7214502096176147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38574081659317017, + "epoch": 8.7, + "learning_rate": 7.2320841551610785e-06, + "loss": 0.4687, + "step": 10290, + "task_loss": 0.6558793187141418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4913393259048462, + "epoch": 8.7, + "learning_rate": 7.227387996618766e-06, + "loss": 0.4222, + "step": 10291, + "task_loss": 0.8690575361251831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.621860921382904, + "epoch": 8.7, + "learning_rate": 7.222691838076454e-06, + "loss": 0.7407, + "step": 10292, + "task_loss": 1.5254828929901123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42471474409103394, + "epoch": 8.7, + "learning_rate": 7.217995679534141e-06, + "loss": 0.588, + "step": 10293, + "task_loss": 0.5383074283599854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3305829167366028, + "epoch": 8.7, + "learning_rate": 7.213299520991829e-06, + "loss": 0.7062, + "step": 10294, + "task_loss": 0.030561823397874832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2524256408214569, + "epoch": 8.7, + "learning_rate": 7.208603362449516e-06, + "loss": 0.5451, + "step": 10295, + "task_loss": 0.12882976233959198 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2741049826145172, + "epoch": 8.7, + "learning_rate": 7.203907203907203e-06, + "loss": 0.537, + "step": 10296, + "task_loss": 0.6194612383842468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6795684099197388, + "epoch": 8.7, + "learning_rate": 7.199211045364892e-06, + "loss": 0.9079, + "step": 10297, + "task_loss": 0.9307163953781128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42480748891830444, + "epoch": 8.7, + "learning_rate": 7.19451488682258e-06, + "loss": 0.5103, + "step": 10298, + "task_loss": 0.18790249526500702 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8875046372413635, + "epoch": 8.71, + "learning_rate": 7.1898187282802675e-06, + "loss": 0.678, + "step": 10299, + "task_loss": 0.4898264408111572 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41284847259521484, + "epoch": 8.71, + "learning_rate": 7.1851225697379555e-06, + "loss": 0.5192, + "step": 10300, + "task_loss": 0.6449827551841736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6076819896697998, + "epoch": 8.71, + "learning_rate": 7.180426411195643e-06, + "loss": 0.6992, + "step": 10301, + "task_loss": 0.40686729550361633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40607112646102905, + "epoch": 8.71, + "learning_rate": 7.17573025265333e-06, + "loss": 0.5669, + "step": 10302, + "task_loss": 0.2284427136182785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2656305432319641, + "epoch": 8.71, + "learning_rate": 7.171034094111018e-06, + "loss": 0.5542, + "step": 10303, + "task_loss": 0.3977203369140625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3871995806694031, + "epoch": 8.71, + "learning_rate": 7.166337935568705e-06, + "loss": 0.4727, + "step": 10304, + "task_loss": 0.9074400663375854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6045183539390564, + "epoch": 8.71, + "learning_rate": 7.161641777026393e-06, + "loss": 0.6096, + "step": 10305, + "task_loss": 1.744801640510559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4028138816356659, + "epoch": 8.71, + "learning_rate": 7.15694561848408e-06, + "loss": 0.6847, + "step": 10306, + "task_loss": 0.9243627190589905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5522833466529846, + "epoch": 8.71, + "learning_rate": 7.1522494599417675e-06, + "loss": 0.7582, + "step": 10307, + "task_loss": 0.7845475077629089 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32468879222869873, + "epoch": 8.71, + "learning_rate": 7.1475533013994555e-06, + "loss": 0.6019, + "step": 10308, + "task_loss": 0.6948211789131165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7046904563903809, + "epoch": 8.71, + "learning_rate": 7.142857142857143e-06, + "loss": 0.5528, + "step": 10309, + "task_loss": 0.7529733180999756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5453115701675415, + "epoch": 8.71, + "learning_rate": 7.13816098431483e-06, + "loss": 0.5532, + "step": 10310, + "task_loss": 0.7622359991073608 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3607589900493622, + "epoch": 8.72, + "learning_rate": 7.133464825772518e-06, + "loss": 0.5302, + "step": 10311, + "task_loss": 0.46916672587394714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6174506545066833, + "epoch": 8.72, + "learning_rate": 7.128768667230205e-06, + "loss": 0.6919, + "step": 10312, + "task_loss": 0.5448486804962158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6433306336402893, + "epoch": 8.72, + "learning_rate": 7.124072508687894e-06, + "loss": 0.5014, + "step": 10313, + "task_loss": 0.49146610498428345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.642281711101532, + "epoch": 8.72, + "learning_rate": 7.119376350145582e-06, + "loss": 0.4231, + "step": 10314, + "task_loss": 0.27000492811203003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5334990620613098, + "epoch": 8.72, + "learning_rate": 7.114680191603269e-06, + "loss": 0.4966, + "step": 10315, + "task_loss": 1.384645938873291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5433629155158997, + "epoch": 8.72, + "learning_rate": 7.109984033060957e-06, + "loss": 0.4974, + "step": 10316, + "task_loss": 0.6724228262901306 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3530283272266388, + "epoch": 8.72, + "learning_rate": 7.1052878745186444e-06, + "loss": 0.5629, + "step": 10317, + "task_loss": 1.1564393043518066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8471308946609497, + "epoch": 8.72, + "learning_rate": 7.100591715976332e-06, + "loss": 0.5103, + "step": 10318, + "task_loss": 0.8414833545684814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4368123412132263, + "epoch": 8.72, + "learning_rate": 7.09589555743402e-06, + "loss": 0.3904, + "step": 10319, + "task_loss": 0.6403210163116455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4052436947822571, + "epoch": 8.72, + "learning_rate": 7.091199398891707e-06, + "loss": 0.5497, + "step": 10320, + "task_loss": 0.9208522439002991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1501593589782715, + "epoch": 8.72, + "learning_rate": 7.086503240349394e-06, + "loss": 0.7032, + "step": 10321, + "task_loss": 1.1602727174758911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4892164468765259, + "epoch": 8.72, + "learning_rate": 7.081807081807082e-06, + "loss": 0.5206, + "step": 10322, + "task_loss": 0.23344182968139648 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5994504690170288, + "epoch": 8.73, + "learning_rate": 7.077110923264769e-06, + "loss": 0.6369, + "step": 10323, + "task_loss": 0.3932313621044159 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5704218149185181, + "epoch": 8.73, + "learning_rate": 7.072414764722457e-06, + "loss": 0.5708, + "step": 10324, + "task_loss": 0.7841705083847046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4839526414871216, + "epoch": 8.73, + "learning_rate": 7.0677186061801445e-06, + "loss": 0.5433, + "step": 10325, + "task_loss": 0.36781391501426697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47805771231651306, + "epoch": 8.73, + "learning_rate": 7.063022447637832e-06, + "loss": 0.8155, + "step": 10326, + "task_loss": 1.5911816358566284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9234185218811035, + "epoch": 8.73, + "learning_rate": 7.05832628909552e-06, + "loss": 0.5984, + "step": 10327, + "task_loss": 1.1262027025222778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4731239974498749, + "epoch": 8.73, + "learning_rate": 7.053630130553209e-06, + "loss": 0.6217, + "step": 10328, + "task_loss": 0.8451111316680908 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7011232376098633, + "epoch": 8.73, + "learning_rate": 7.048933972010896e-06, + "loss": 0.6255, + "step": 10329, + "task_loss": 0.9285666346549988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7590755820274353, + "epoch": 8.73, + "learning_rate": 7.044237813468584e-06, + "loss": 0.605, + "step": 10330, + "task_loss": 0.5850201845169067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6208679676055908, + "epoch": 8.73, + "learning_rate": 7.039541654926271e-06, + "loss": 0.6377, + "step": 10331, + "task_loss": 0.5315170884132385 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39641034603118896, + "epoch": 8.73, + "learning_rate": 7.034845496383958e-06, + "loss": 0.53, + "step": 10332, + "task_loss": 1.0273722410202026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7928783893585205, + "epoch": 8.73, + "learning_rate": 7.030149337841646e-06, + "loss": 0.5447, + "step": 10333, + "task_loss": 0.4759225845336914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3300149738788605, + "epoch": 8.73, + "learning_rate": 7.025453179299333e-06, + "loss": 0.515, + "step": 10334, + "task_loss": 0.19636566936969757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4446650445461273, + "epoch": 8.74, + "learning_rate": 7.0207570207570214e-06, + "loss": 0.4545, + "step": 10335, + "task_loss": 0.1241639032959938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5092456340789795, + "epoch": 8.74, + "learning_rate": 7.016060862214709e-06, + "loss": 0.6787, + "step": 10336, + "task_loss": 1.131974697113037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9969494938850403, + "epoch": 8.74, + "learning_rate": 7.011364703672396e-06, + "loss": 0.5936, + "step": 10337, + "task_loss": 1.2124367952346802 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7758007049560547, + "epoch": 8.74, + "learning_rate": 7.006668545130084e-06, + "loss": 0.5903, + "step": 10338, + "task_loss": 1.1535673141479492 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9242762923240662, + "epoch": 8.74, + "learning_rate": 7.001972386587771e-06, + "loss": 0.7378, + "step": 10339, + "task_loss": 2.1080024242401123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1646398305892944, + "epoch": 8.74, + "learning_rate": 6.997276228045459e-06, + "loss": 0.7864, + "step": 10340, + "task_loss": 1.020667314529419 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48240146040916443, + "epoch": 8.74, + "learning_rate": 6.992580069503146e-06, + "loss": 0.5039, + "step": 10341, + "task_loss": 0.801193118095398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.546881914138794, + "epoch": 8.74, + "learning_rate": 6.9878839109608334e-06, + "loss": 0.5739, + "step": 10342, + "task_loss": 0.7922036647796631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5985745191574097, + "epoch": 8.74, + "learning_rate": 6.9831877524185215e-06, + "loss": 0.5882, + "step": 10343, + "task_loss": 0.7803962230682373 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5321500897407532, + "epoch": 8.74, + "learning_rate": 6.97849159387621e-06, + "loss": 0.5902, + "step": 10344, + "task_loss": 1.0890376567840576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4115813672542572, + "epoch": 8.74, + "learning_rate": 6.9737954353338975e-06, + "loss": 0.5625, + "step": 10345, + "task_loss": 0.5858675241470337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5150871276855469, + "epoch": 8.75, + "learning_rate": 6.9690992767915856e-06, + "loss": 0.5448, + "step": 10346, + "task_loss": 1.65718674659729 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6447421312332153, + "epoch": 8.75, + "learning_rate": 6.964403118249273e-06, + "loss": 0.5909, + "step": 10347, + "task_loss": 0.6460264921188354 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5438698530197144, + "epoch": 8.75, + "learning_rate": 6.95970695970696e-06, + "loss": 0.4946, + "step": 10348, + "task_loss": 0.27395099401474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5077297687530518, + "epoch": 8.75, + "learning_rate": 6.955010801164648e-06, + "loss": 0.6026, + "step": 10349, + "task_loss": 0.3998420536518097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2089540660381317, + "epoch": 8.75, + "learning_rate": 6.950314642622335e-06, + "loss": 0.4381, + "step": 10350, + "task_loss": 0.23408906161785126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5050946474075317, + "epoch": 8.75, + "learning_rate": 6.945618484080023e-06, + "loss": 0.612, + "step": 10351, + "task_loss": 1.038474440574646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6926410794258118, + "epoch": 8.75, + "learning_rate": 6.94092232553771e-06, + "loss": 0.5846, + "step": 10352, + "task_loss": 0.8045667409896851 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7609493732452393, + "epoch": 8.75, + "learning_rate": 6.9362261669953976e-06, + "loss": 0.7227, + "step": 10353, + "task_loss": 1.123343825340271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7818836569786072, + "epoch": 8.75, + "learning_rate": 6.931530008453086e-06, + "loss": 0.7368, + "step": 10354, + "task_loss": 0.962765634059906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9259731769561768, + "epoch": 8.75, + "learning_rate": 6.926833849910773e-06, + "loss": 0.545, + "step": 10355, + "task_loss": 1.2098792791366577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5749423503875732, + "epoch": 8.75, + "learning_rate": 6.922137691368461e-06, + "loss": 0.647, + "step": 10356, + "task_loss": 0.8294190764427185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5515408515930176, + "epoch": 8.75, + "learning_rate": 6.917441532826148e-06, + "loss": 0.4932, + "step": 10357, + "task_loss": 0.49761950969696045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46829307079315186, + "epoch": 8.76, + "learning_rate": 6.912745374283835e-06, + "loss": 0.5233, + "step": 10358, + "task_loss": 0.3312387466430664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5920547246932983, + "epoch": 8.76, + "learning_rate": 6.908049215741524e-06, + "loss": 0.6569, + "step": 10359, + "task_loss": 1.524283766746521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.824539303779602, + "epoch": 8.76, + "learning_rate": 6.903353057199212e-06, + "loss": 0.5025, + "step": 10360, + "task_loss": 1.139722466468811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.440110445022583, + "epoch": 8.76, + "learning_rate": 6.898656898656899e-06, + "loss": 0.6019, + "step": 10361, + "task_loss": 0.2952304482460022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5744550824165344, + "epoch": 8.76, + "learning_rate": 6.893960740114587e-06, + "loss": 0.5686, + "step": 10362, + "task_loss": 0.7930154800415039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8894174098968506, + "epoch": 8.76, + "learning_rate": 6.8892645815722745e-06, + "loss": 0.6205, + "step": 10363, + "task_loss": 0.7520231008529663 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4098445773124695, + "epoch": 8.76, + "learning_rate": 6.884568423029962e-06, + "loss": 0.5809, + "step": 10364, + "task_loss": 0.4273567497730255 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47490426898002625, + "epoch": 8.76, + "learning_rate": 6.87987226448765e-06, + "loss": 0.7179, + "step": 10365, + "task_loss": 0.40222540497779846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38343924283981323, + "epoch": 8.76, + "learning_rate": 6.875176105945337e-06, + "loss": 0.4499, + "step": 10366, + "task_loss": 0.7117496132850647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5507478713989258, + "epoch": 8.76, + "learning_rate": 6.870479947403025e-06, + "loss": 0.5673, + "step": 10367, + "task_loss": 0.4095752239227295 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5391383171081543, + "epoch": 8.76, + "learning_rate": 6.865783788860712e-06, + "loss": 0.7056, + "step": 10368, + "task_loss": 0.2525249421596527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.003394603729248, + "epoch": 8.76, + "learning_rate": 6.861087630318399e-06, + "loss": 0.7911, + "step": 10369, + "task_loss": 1.281363844871521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47802233695983887, + "epoch": 8.77, + "learning_rate": 6.856391471776087e-06, + "loss": 0.4298, + "step": 10370, + "task_loss": 1.5152432918548584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.24873724579811096, + "epoch": 8.77, + "learning_rate": 6.8516953132337745e-06, + "loss": 0.5605, + "step": 10371, + "task_loss": 0.3715301752090454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6261681914329529, + "epoch": 8.77, + "learning_rate": 6.846999154691463e-06, + "loss": 0.475, + "step": 10372, + "task_loss": 0.27044540643692017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5969523787498474, + "epoch": 8.77, + "learning_rate": 6.84230299614915e-06, + "loss": 0.5063, + "step": 10373, + "task_loss": 0.7095768451690674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4579583406448364, + "epoch": 8.77, + "learning_rate": 6.837606837606839e-06, + "loss": 0.5751, + "step": 10374, + "task_loss": 0.27386367321014404 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7495284080505371, + "epoch": 8.77, + "learning_rate": 6.832910679064526e-06, + "loss": 0.5947, + "step": 10375, + "task_loss": 0.4731720983982086 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5301280617713928, + "epoch": 8.77, + "learning_rate": 6.828214520522214e-06, + "loss": 0.4248, + "step": 10376, + "task_loss": 0.32319796085357666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29669851064682007, + "epoch": 8.77, + "learning_rate": 6.823518361979901e-06, + "loss": 0.4883, + "step": 10377, + "task_loss": 0.4422501027584076 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.23853106796741486, + "epoch": 8.77, + "learning_rate": 6.818822203437589e-06, + "loss": 0.4753, + "step": 10378, + "task_loss": 0.06537524610757828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4235384166240692, + "epoch": 8.77, + "learning_rate": 6.814126044895276e-06, + "loss": 0.5502, + "step": 10379, + "task_loss": 0.5021094083786011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43977999687194824, + "epoch": 8.77, + "learning_rate": 6.8094298863529635e-06, + "loss": 0.5426, + "step": 10380, + "task_loss": 1.261482834815979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4326688051223755, + "epoch": 8.77, + "learning_rate": 6.8047337278106515e-06, + "loss": 0.5728, + "step": 10381, + "task_loss": 0.1569068729877472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2495833933353424, + "epoch": 8.78, + "learning_rate": 6.800037569268339e-06, + "loss": 0.4239, + "step": 10382, + "task_loss": 0.8380419611930847 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44397997856140137, + "epoch": 8.78, + "learning_rate": 6.795341410726027e-06, + "loss": 0.612, + "step": 10383, + "task_loss": 0.8200160264968872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6290370225906372, + "epoch": 8.78, + "learning_rate": 6.790645252183714e-06, + "loss": 0.6999, + "step": 10384, + "task_loss": 0.29773586988449097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4930514097213745, + "epoch": 8.78, + "learning_rate": 6.785949093641401e-06, + "loss": 0.4861, + "step": 10385, + "task_loss": 0.42606353759765625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6975401043891907, + "epoch": 8.78, + "learning_rate": 6.781252935099089e-06, + "loss": 0.6302, + "step": 10386, + "task_loss": 0.9619995951652527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9207071661949158, + "epoch": 8.78, + "learning_rate": 6.776556776556776e-06, + "loss": 0.5637, + "step": 10387, + "task_loss": 1.1698555946350098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2553858160972595, + "epoch": 8.78, + "learning_rate": 6.7718606180144635e-06, + "loss": 0.4198, + "step": 10388, + "task_loss": 0.3329714834690094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6454405784606934, + "epoch": 8.78, + "learning_rate": 6.7671644594721515e-06, + "loss": 0.7077, + "step": 10389, + "task_loss": 1.0938903093338013 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.4060289859771729, + "epoch": 8.78, + "learning_rate": 6.76246830092984e-06, + "loss": 0.8242, + "step": 10390, + "task_loss": 0.9555860757827759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3893490135669708, + "epoch": 8.78, + "learning_rate": 6.757772142387528e-06, + "loss": 0.5632, + "step": 10391, + "task_loss": 0.7246307134628296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5024700164794922, + "epoch": 8.78, + "learning_rate": 6.753075983845216e-06, + "loss": 0.5813, + "step": 10392, + "task_loss": 0.7267103791236877 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7856003642082214, + "epoch": 8.78, + "learning_rate": 6.748379825302903e-06, + "loss": 0.5567, + "step": 10393, + "task_loss": 1.0447009801864624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5717740058898926, + "epoch": 8.79, + "learning_rate": 6.743683666760591e-06, + "loss": 0.6829, + "step": 10394, + "task_loss": 0.6524147391319275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9063979983329773, + "epoch": 8.79, + "learning_rate": 6.738987508218278e-06, + "loss": 0.5572, + "step": 10395, + "task_loss": 0.7100487351417542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4303475022315979, + "epoch": 8.79, + "learning_rate": 6.734291349675965e-06, + "loss": 0.4604, + "step": 10396, + "task_loss": 0.4483071267604828 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5285152196884155, + "epoch": 8.79, + "learning_rate": 6.729595191133653e-06, + "loss": 0.8343, + "step": 10397, + "task_loss": 0.3597392737865448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6513479948043823, + "epoch": 8.79, + "learning_rate": 6.7248990325913404e-06, + "loss": 0.468, + "step": 10398, + "task_loss": 0.11653785407543182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2905402183532715, + "epoch": 8.79, + "learning_rate": 6.720202874049028e-06, + "loss": 0.5471, + "step": 10399, + "task_loss": 0.5148707032203674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6005892157554626, + "epoch": 8.79, + "learning_rate": 6.715506715506716e-06, + "loss": 0.5781, + "step": 10400, + "task_loss": 1.1965886354446411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5519025325775146, + "epoch": 8.79, + "learning_rate": 6.710810556964403e-06, + "loss": 0.5018, + "step": 10401, + "task_loss": 0.628398597240448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3725884258747101, + "epoch": 8.79, + "learning_rate": 6.706114398422091e-06, + "loss": 0.5824, + "step": 10402, + "task_loss": 0.3823162615299225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3396443724632263, + "epoch": 8.79, + "learning_rate": 6.701418239879778e-06, + "loss": 0.4836, + "step": 10403, + "task_loss": 0.46902385354042053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6300222873687744, + "epoch": 8.79, + "learning_rate": 6.696722081337465e-06, + "loss": 0.6315, + "step": 10404, + "task_loss": 0.5616195201873779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6996287107467651, + "epoch": 8.79, + "learning_rate": 6.692025922795154e-06, + "loss": 0.6816, + "step": 10405, + "task_loss": 0.4466788172721863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42925891280174255, + "epoch": 8.8, + "learning_rate": 6.687329764252842e-06, + "loss": 0.4206, + "step": 10406, + "task_loss": 1.5744917392730713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5955492258071899, + "epoch": 8.8, + "learning_rate": 6.682633605710529e-06, + "loss": 0.4532, + "step": 10407, + "task_loss": 0.784270167350769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3209541440010071, + "epoch": 8.8, + "learning_rate": 6.677937447168217e-06, + "loss": 0.4188, + "step": 10408, + "task_loss": 0.6802537441253662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6133323311805725, + "epoch": 8.8, + "learning_rate": 6.673241288625905e-06, + "loss": 0.5299, + "step": 10409, + "task_loss": 0.21387360990047455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4159771502017975, + "epoch": 8.8, + "learning_rate": 6.668545130083592e-06, + "loss": 0.3602, + "step": 10410, + "task_loss": 0.21253615617752075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7224434614181519, + "epoch": 8.8, + "learning_rate": 6.66384897154128e-06, + "loss": 0.6049, + "step": 10411, + "task_loss": 1.0532963275909424 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6067714691162109, + "epoch": 8.8, + "learning_rate": 6.659152812998967e-06, + "loss": 0.6459, + "step": 10412, + "task_loss": 1.8054485321044922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5165995955467224, + "epoch": 8.8, + "learning_rate": 6.654456654456655e-06, + "loss": 0.5703, + "step": 10413, + "task_loss": 0.8915274739265442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5776345729827881, + "epoch": 8.8, + "learning_rate": 6.649760495914342e-06, + "loss": 0.5018, + "step": 10414, + "task_loss": 0.7522205710411072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6961765289306641, + "epoch": 8.8, + "learning_rate": 6.645064337372029e-06, + "loss": 0.6906, + "step": 10415, + "task_loss": 1.2775108814239502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4550372362136841, + "epoch": 8.8, + "learning_rate": 6.6403681788297174e-06, + "loss": 0.5347, + "step": 10416, + "task_loss": 0.16290467977523804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6905685663223267, + "epoch": 8.81, + "learning_rate": 6.635672020287405e-06, + "loss": 0.6114, + "step": 10417, + "task_loss": 1.5535417795181274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.75383061170578, + "epoch": 8.81, + "learning_rate": 6.630975861745093e-06, + "loss": 0.6003, + "step": 10418, + "task_loss": 0.9565126895904541 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49867987632751465, + "epoch": 8.81, + "learning_rate": 6.62627970320278e-06, + "loss": 0.6153, + "step": 10419, + "task_loss": 0.46796485781669617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4652766287326813, + "epoch": 8.81, + "learning_rate": 6.621583544660467e-06, + "loss": 0.5645, + "step": 10420, + "task_loss": 1.2161763906478882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6370488405227661, + "epoch": 8.81, + "learning_rate": 6.616887386118156e-06, + "loss": 0.5869, + "step": 10421, + "task_loss": 0.8086147904396057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2697755694389343, + "epoch": 8.81, + "learning_rate": 6.612191227575844e-06, + "loss": 0.5027, + "step": 10422, + "task_loss": 0.624340832233429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38525134325027466, + "epoch": 8.81, + "learning_rate": 6.607495069033531e-06, + "loss": 0.6076, + "step": 10423, + "task_loss": 0.9852173924446106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7979705333709717, + "epoch": 8.81, + "learning_rate": 6.602798910491219e-06, + "loss": 0.5747, + "step": 10424, + "task_loss": 0.4745531976222992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3308601379394531, + "epoch": 8.81, + "learning_rate": 6.598102751948906e-06, + "loss": 0.4769, + "step": 10425, + "task_loss": 0.5257015228271484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34645822644233704, + "epoch": 8.81, + "learning_rate": 6.5934065934065935e-06, + "loss": 0.6759, + "step": 10426, + "task_loss": 0.26383987069129944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6514652967453003, + "epoch": 8.81, + "learning_rate": 6.5887104348642816e-06, + "loss": 0.5821, + "step": 10427, + "task_loss": 1.3307198286056519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.512481689453125, + "epoch": 8.81, + "learning_rate": 6.584014276321969e-06, + "loss": 0.4809, + "step": 10428, + "task_loss": 0.9226614832878113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4696445167064667, + "epoch": 8.82, + "learning_rate": 6.579318117779657e-06, + "loss": 0.4013, + "step": 10429, + "task_loss": 0.8072288632392883 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7221746444702148, + "epoch": 8.82, + "learning_rate": 6.574621959237344e-06, + "loss": 0.6356, + "step": 10430, + "task_loss": 0.8518087863922119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3668014705181122, + "epoch": 8.82, + "learning_rate": 6.569925800695031e-06, + "loss": 0.5083, + "step": 10431, + "task_loss": 1.0403099060058594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4611468017101288, + "epoch": 8.82, + "learning_rate": 6.565229642152719e-06, + "loss": 0.62, + "step": 10432, + "task_loss": 1.0703948736190796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39971137046813965, + "epoch": 8.82, + "learning_rate": 6.560533483610406e-06, + "loss": 0.5758, + "step": 10433, + "task_loss": 0.391137033700943 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4856651723384857, + "epoch": 8.82, + "learning_rate": 6.555837325068094e-06, + "loss": 0.4485, + "step": 10434, + "task_loss": 0.25058242678642273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4118480384349823, + "epoch": 8.82, + "learning_rate": 6.551141166525782e-06, + "loss": 0.5063, + "step": 10435, + "task_loss": 0.7992642521858215 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5740260481834412, + "epoch": 8.82, + "learning_rate": 6.5464450079834705e-06, + "loss": 0.7054, + "step": 10436, + "task_loss": 0.5549280047416687 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9405794739723206, + "epoch": 8.82, + "learning_rate": 6.541748849441158e-06, + "loss": 0.6946, + "step": 10437, + "task_loss": 1.5863858461380005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5440404415130615, + "epoch": 8.82, + "learning_rate": 6.537052690898846e-06, + "loss": 0.54, + "step": 10438, + "task_loss": 1.5026124715805054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7753065228462219, + "epoch": 8.82, + "learning_rate": 6.532356532356533e-06, + "loss": 0.5878, + "step": 10439, + "task_loss": 0.5904995203018188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1920926570892334, + "epoch": 8.82, + "learning_rate": 6.527660373814221e-06, + "loss": 0.7118, + "step": 10440, + "task_loss": 1.282610297203064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5998774766921997, + "epoch": 8.83, + "learning_rate": 6.522964215271908e-06, + "loss": 0.5666, + "step": 10441, + "task_loss": 0.7203288078308105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.556732177734375, + "epoch": 8.83, + "learning_rate": 6.518268056729595e-06, + "loss": 0.6738, + "step": 10442, + "task_loss": 0.6162696480751038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5914292335510254, + "epoch": 8.83, + "learning_rate": 6.513571898187283e-06, + "loss": 0.4567, + "step": 10443, + "task_loss": 1.6884870529174805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6201949119567871, + "epoch": 8.83, + "learning_rate": 6.5088757396449705e-06, + "loss": 0.6272, + "step": 10444, + "task_loss": 0.8216149806976318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6271955966949463, + "epoch": 8.83, + "learning_rate": 6.5041795811026586e-06, + "loss": 0.6692, + "step": 10445, + "task_loss": 0.11766261607408524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5160413384437561, + "epoch": 8.83, + "learning_rate": 6.499483422560346e-06, + "loss": 0.441, + "step": 10446, + "task_loss": 0.30405688285827637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5667554140090942, + "epoch": 8.83, + "learning_rate": 6.494787264018033e-06, + "loss": 0.6421, + "step": 10447, + "task_loss": 1.0299257040023804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6120223999023438, + "epoch": 8.83, + "learning_rate": 6.490091105475721e-06, + "loss": 0.5278, + "step": 10448, + "task_loss": 0.4050830006599426 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5379377603530884, + "epoch": 8.83, + "learning_rate": 6.485394946933408e-06, + "loss": 0.6638, + "step": 10449, + "task_loss": 0.7679821848869324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4935733675956726, + "epoch": 8.83, + "learning_rate": 6.480698788391096e-06, + "loss": 0.6012, + "step": 10450, + "task_loss": 0.7826787233352661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8047462701797485, + "epoch": 8.83, + "learning_rate": 6.476002629848785e-06, + "loss": 0.8206, + "step": 10451, + "task_loss": 1.1417081356048584 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45155438780784607, + "epoch": 8.83, + "learning_rate": 6.471306471306472e-06, + "loss": 0.5956, + "step": 10452, + "task_loss": 0.12050943821668625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4839274287223816, + "epoch": 8.84, + "learning_rate": 6.4666103127641594e-06, + "loss": 0.5678, + "step": 10453, + "task_loss": 0.28474777936935425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9469181299209595, + "epoch": 8.84, + "learning_rate": 6.4619141542218475e-06, + "loss": 0.5997, + "step": 10454, + "task_loss": 1.0008409023284912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0321787595748901, + "epoch": 8.84, + "learning_rate": 6.457217995679535e-06, + "loss": 0.5809, + "step": 10455, + "task_loss": 0.8065201044082642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4036603271961212, + "epoch": 8.84, + "learning_rate": 6.452521837137223e-06, + "loss": 0.5835, + "step": 10456, + "task_loss": 1.1362075805664062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1988568305969238, + "epoch": 8.84, + "learning_rate": 6.44782567859491e-06, + "loss": 0.7116, + "step": 10457, + "task_loss": 0.8144145011901855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36128729581832886, + "epoch": 8.84, + "learning_rate": 6.443129520052597e-06, + "loss": 0.4231, + "step": 10458, + "task_loss": 1.609217643737793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6983079314231873, + "epoch": 8.84, + "learning_rate": 6.438433361510285e-06, + "loss": 0.508, + "step": 10459, + "task_loss": 0.6522270441055298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42222023010253906, + "epoch": 8.84, + "learning_rate": 6.433737202967972e-06, + "loss": 0.5058, + "step": 10460, + "task_loss": 0.7344433665275574 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46815043687820435, + "epoch": 8.84, + "learning_rate": 6.42904104442566e-06, + "loss": 0.5615, + "step": 10461, + "task_loss": 0.7782984375953674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40455693006515503, + "epoch": 8.84, + "learning_rate": 6.4243448858833475e-06, + "loss": 0.6212, + "step": 10462, + "task_loss": 0.916233241558075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5114191770553589, + "epoch": 8.84, + "learning_rate": 6.419648727341035e-06, + "loss": 0.6404, + "step": 10463, + "task_loss": 0.49829959869384766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39242225885391235, + "epoch": 8.84, + "learning_rate": 6.414952568798723e-06, + "loss": 0.4748, + "step": 10464, + "task_loss": 0.5040316581726074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3969857096672058, + "epoch": 8.85, + "learning_rate": 6.41025641025641e-06, + "loss": 0.4998, + "step": 10465, + "task_loss": 0.7113975882530212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4179069995880127, + "epoch": 8.85, + "learning_rate": 6.405560251714097e-06, + "loss": 0.5858, + "step": 10466, + "task_loss": 0.6395835280418396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.24902011454105377, + "epoch": 8.85, + "learning_rate": 6.400864093171787e-06, + "loss": 0.4231, + "step": 10467, + "task_loss": 1.0332974195480347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6002901792526245, + "epoch": 8.85, + "learning_rate": 6.396167934629474e-06, + "loss": 0.6387, + "step": 10468, + "task_loss": 0.9690895676612854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3422654867172241, + "epoch": 8.85, + "learning_rate": 6.391471776087161e-06, + "loss": 0.5816, + "step": 10469, + "task_loss": 1.0971112251281738 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6650229692459106, + "epoch": 8.85, + "learning_rate": 6.386775617544849e-06, + "loss": 0.7185, + "step": 10470, + "task_loss": 1.0368298292160034 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26948630809783936, + "epoch": 8.85, + "learning_rate": 6.382079459002536e-06, + "loss": 0.5034, + "step": 10471, + "task_loss": 0.6586220264434814 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49230918288230896, + "epoch": 8.85, + "learning_rate": 6.3773833004602245e-06, + "loss": 0.6279, + "step": 10472, + "task_loss": 1.0231226682662964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4852588176727295, + "epoch": 8.85, + "learning_rate": 6.372687141917912e-06, + "loss": 0.6573, + "step": 10473, + "task_loss": 0.7053407430648804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5049886703491211, + "epoch": 8.85, + "learning_rate": 6.367990983375599e-06, + "loss": 0.5053, + "step": 10474, + "task_loss": 0.38399216532707214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47920000553131104, + "epoch": 8.85, + "learning_rate": 6.363294824833287e-06, + "loss": 0.501, + "step": 10475, + "task_loss": 0.24468664824962616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4886174499988556, + "epoch": 8.85, + "learning_rate": 6.358598666290974e-06, + "loss": 0.467, + "step": 10476, + "task_loss": 0.7694076895713806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7213046550750732, + "epoch": 8.86, + "learning_rate": 6.353902507748661e-06, + "loss": 0.5097, + "step": 10477, + "task_loss": 0.5757997035980225 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3465002179145813, + "epoch": 8.86, + "learning_rate": 6.349206349206349e-06, + "loss": 0.5685, + "step": 10478, + "task_loss": 0.28717365860939026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8193284869194031, + "epoch": 8.86, + "learning_rate": 6.3445101906640365e-06, + "loss": 0.6364, + "step": 10479, + "task_loss": 0.3689892888069153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4618656635284424, + "epoch": 8.86, + "learning_rate": 6.3398140321217245e-06, + "loss": 0.4799, + "step": 10480, + "task_loss": 0.685082197189331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6649463772773743, + "epoch": 8.86, + "learning_rate": 6.335117873579412e-06, + "loss": 0.6973, + "step": 10481, + "task_loss": 1.2995537519454956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6655111908912659, + "epoch": 8.86, + "learning_rate": 6.3304217150371006e-06, + "loss": 0.7191, + "step": 10482, + "task_loss": 1.5514754056930542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.872948408126831, + "epoch": 8.86, + "learning_rate": 6.325725556494788e-06, + "loss": 0.6074, + "step": 10483, + "task_loss": 0.46104952692985535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47382640838623047, + "epoch": 8.86, + "learning_rate": 6.321029397952476e-06, + "loss": 0.6258, + "step": 10484, + "task_loss": 0.1993219405412674 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6135778427124023, + "epoch": 8.86, + "learning_rate": 6.316333239410163e-06, + "loss": 0.6431, + "step": 10485, + "task_loss": 1.2203813791275024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48063546419143677, + "epoch": 8.86, + "learning_rate": 6.311637080867851e-06, + "loss": 0.5087, + "step": 10486, + "task_loss": 0.6722642183303833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4375566840171814, + "epoch": 8.86, + "learning_rate": 6.306940922325538e-06, + "loss": 0.4939, + "step": 10487, + "task_loss": 0.6924952268600464 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4275206923484802, + "epoch": 8.87, + "learning_rate": 6.302244763783225e-06, + "loss": 0.6227, + "step": 10488, + "task_loss": 1.223399043083191 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4871463477611542, + "epoch": 8.87, + "learning_rate": 6.297548605240913e-06, + "loss": 0.473, + "step": 10489, + "task_loss": 0.7852860689163208 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6707467436790466, + "epoch": 8.87, + "learning_rate": 6.292852446698601e-06, + "loss": 0.661, + "step": 10490, + "task_loss": 0.4987085461616516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.919183075428009, + "epoch": 8.87, + "learning_rate": 6.288156288156289e-06, + "loss": 0.5969, + "step": 10491, + "task_loss": 0.7220089435577393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26325440406799316, + "epoch": 8.87, + "learning_rate": 6.283460129613976e-06, + "loss": 0.5616, + "step": 10492, + "task_loss": 0.7322744727134705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5973873138427734, + "epoch": 8.87, + "learning_rate": 6.278763971071663e-06, + "loss": 0.6275, + "step": 10493, + "task_loss": 1.1629621982574463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48154014348983765, + "epoch": 8.87, + "learning_rate": 6.274067812529351e-06, + "loss": 0.4906, + "step": 10494, + "task_loss": 0.6950964331626892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7592697143554688, + "epoch": 8.87, + "learning_rate": 6.269371653987038e-06, + "loss": 0.6165, + "step": 10495, + "task_loss": 1.1498048305511475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6751308441162109, + "epoch": 8.87, + "learning_rate": 6.264675495444726e-06, + "loss": 0.5845, + "step": 10496, + "task_loss": 0.6908773183822632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5847740173339844, + "epoch": 8.87, + "learning_rate": 6.2599793369024134e-06, + "loss": 0.5012, + "step": 10497, + "task_loss": 0.34573429822921753 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47404396533966064, + "epoch": 8.87, + "learning_rate": 6.255283178360102e-06, + "loss": 0.5892, + "step": 10498, + "task_loss": 0.39461416006088257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4307149052619934, + "epoch": 8.87, + "learning_rate": 6.2505870198177895e-06, + "loss": 0.5487, + "step": 10499, + "task_loss": 0.3059642016887665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6312223672866821, + "epoch": 8.88, + "learning_rate": 6.245890861275477e-06, + "loss": 0.5843, + "step": 10500, + "task_loss": 1.181584358215332 + }, + { + "epoch": 8.88, + "eval_accuracy": 0.9035247524752476, + "eval_loss": 0.36775341629981995, + "eval_runtime": 226.4418, + "eval_samples_per_second": 111.508, + "eval_steps_per_second": 0.874, + "step": 10500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6175811290740967, + "epoch": 8.88, + "learning_rate": 6.241194702733164e-06, + "loss": 0.5708, + "step": 10501, + "task_loss": 0.7046071887016296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4059455394744873, + "epoch": 8.88, + "learning_rate": 6.236498544190853e-06, + "loss": 0.6378, + "step": 10502, + "task_loss": 0.7925266623497009 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7190342545509338, + "epoch": 8.88, + "learning_rate": 6.23180238564854e-06, + "loss": 0.6777, + "step": 10503, + "task_loss": 0.825637936592102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5120416879653931, + "epoch": 8.88, + "learning_rate": 6.227106227106227e-06, + "loss": 0.5795, + "step": 10504, + "task_loss": 1.0261753797531128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3790232837200165, + "epoch": 8.88, + "learning_rate": 6.222410068563915e-06, + "loss": 0.3805, + "step": 10505, + "task_loss": 0.5463254451751709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35531336069107056, + "epoch": 8.88, + "learning_rate": 6.217713910021602e-06, + "loss": 0.4518, + "step": 10506, + "task_loss": 0.3739545941352844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49007144570350647, + "epoch": 8.88, + "learning_rate": 6.21301775147929e-06, + "loss": 0.6577, + "step": 10507, + "task_loss": 0.5178561210632324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7658932209014893, + "epoch": 8.88, + "learning_rate": 6.2083215929369776e-06, + "loss": 0.4989, + "step": 10508, + "task_loss": 0.9057541489601135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41150379180908203, + "epoch": 8.88, + "learning_rate": 6.203625434394666e-06, + "loss": 0.5817, + "step": 10509, + "task_loss": 0.5549957752227783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5359537601470947, + "epoch": 8.88, + "learning_rate": 6.198929275852354e-06, + "loss": 0.4818, + "step": 10510, + "task_loss": 0.24790579080581665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6541622877120972, + "epoch": 8.88, + "learning_rate": 6.194233117310041e-06, + "loss": 0.5468, + "step": 10511, + "task_loss": 0.73049396276474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3074570298194885, + "epoch": 8.89, + "learning_rate": 6.189536958767728e-06, + "loss": 0.5419, + "step": 10512, + "task_loss": 0.38451525568962097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4428836703300476, + "epoch": 8.89, + "learning_rate": 6.184840800225416e-06, + "loss": 0.5495, + "step": 10513, + "task_loss": 1.2117732763290405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37948668003082275, + "epoch": 8.89, + "learning_rate": 6.180144641683103e-06, + "loss": 0.3615, + "step": 10514, + "task_loss": 0.5162052512168884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7426187992095947, + "epoch": 8.89, + "learning_rate": 6.175448483140791e-06, + "loss": 0.6237, + "step": 10515, + "task_loss": 1.619215488433838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9728736281394958, + "epoch": 8.89, + "learning_rate": 6.1707523245984785e-06, + "loss": 0.7038, + "step": 10516, + "task_loss": 0.3186282217502594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7929612398147583, + "epoch": 8.89, + "learning_rate": 6.1660561660561665e-06, + "loss": 0.6147, + "step": 10517, + "task_loss": 0.8639477491378784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5141814351081848, + "epoch": 8.89, + "learning_rate": 6.1613600075138545e-06, + "loss": 0.5702, + "step": 10518, + "task_loss": 0.739983081817627 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4008757174015045, + "epoch": 8.89, + "learning_rate": 6.156663848971542e-06, + "loss": 0.6664, + "step": 10519, + "task_loss": 0.39638882875442505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7872204184532166, + "epoch": 8.89, + "learning_rate": 6.151967690429229e-06, + "loss": 0.6088, + "step": 10520, + "task_loss": 0.9730578660964966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48557716608047485, + "epoch": 8.89, + "learning_rate": 6.147271531886917e-06, + "loss": 0.5429, + "step": 10521, + "task_loss": 0.5730066299438477 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.065480351448059, + "epoch": 8.89, + "learning_rate": 6.142575373344604e-06, + "loss": 0.6306, + "step": 10522, + "task_loss": 1.4081183671951294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2601175606250763, + "epoch": 8.89, + "learning_rate": 6.137879214802292e-06, + "loss": 0.4472, + "step": 10523, + "task_loss": 0.035313066095113754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4732591509819031, + "epoch": 8.9, + "learning_rate": 6.133183056259979e-06, + "loss": 0.5128, + "step": 10524, + "task_loss": 0.22989100217819214 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5309720039367676, + "epoch": 8.9, + "learning_rate": 6.128486897717667e-06, + "loss": 0.6482, + "step": 10525, + "task_loss": 0.4045575261116028 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25010979175567627, + "epoch": 8.9, + "learning_rate": 6.123790739175355e-06, + "loss": 0.4692, + "step": 10526, + "task_loss": 0.7692723870277405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2988647520542145, + "epoch": 8.9, + "learning_rate": 6.119094580633043e-06, + "loss": 0.4172, + "step": 10527, + "task_loss": 0.2577003240585327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34933677315711975, + "epoch": 8.9, + "learning_rate": 6.11439842209073e-06, + "loss": 0.4424, + "step": 10528, + "task_loss": 0.5399209856987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5226921439170837, + "epoch": 8.9, + "learning_rate": 6.109702263548418e-06, + "loss": 0.6551, + "step": 10529, + "task_loss": 0.8803897500038147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.012583613395691, + "epoch": 8.9, + "learning_rate": 6.105006105006105e-06, + "loss": 0.7396, + "step": 10530, + "task_loss": 2.12973690032959 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6580854654312134, + "epoch": 8.9, + "learning_rate": 6.100309946463793e-06, + "loss": 0.5825, + "step": 10531, + "task_loss": 1.1332801580429077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42751938104629517, + "epoch": 8.9, + "learning_rate": 6.095613787921481e-06, + "loss": 0.4451, + "step": 10532, + "task_loss": 1.241970181465149 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9219502806663513, + "epoch": 8.9, + "learning_rate": 6.090917629379168e-06, + "loss": 0.7883, + "step": 10533, + "task_loss": 1.1312386989593506 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5064666271209717, + "epoch": 8.9, + "learning_rate": 6.086221470836856e-06, + "loss": 0.5312, + "step": 10534, + "task_loss": 1.2637144327163696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0976104736328125, + "epoch": 8.9, + "learning_rate": 6.0815253122945435e-06, + "loss": 0.6865, + "step": 10535, + "task_loss": 1.5269243717193604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7505024671554565, + "epoch": 8.91, + "learning_rate": 6.076829153752231e-06, + "loss": 0.5784, + "step": 10536, + "task_loss": 0.668350100517273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3253743350505829, + "epoch": 8.91, + "learning_rate": 6.072132995209919e-06, + "loss": 0.4478, + "step": 10537, + "task_loss": 0.4265812039375305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39171114563941956, + "epoch": 8.91, + "learning_rate": 6.067436836667606e-06, + "loss": 0.5411, + "step": 10538, + "task_loss": 1.0479719638824463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5179042220115662, + "epoch": 8.91, + "learning_rate": 6.062740678125294e-06, + "loss": 0.6212, + "step": 10539, + "task_loss": 1.1634745597839355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5120424628257751, + "epoch": 8.91, + "learning_rate": 6.058044519582982e-06, + "loss": 0.453, + "step": 10540, + "task_loss": 1.7708070278167725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7572463154792786, + "epoch": 8.91, + "learning_rate": 6.053348361040669e-06, + "loss": 0.592, + "step": 10541, + "task_loss": 0.7134084105491638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49136799573898315, + "epoch": 8.91, + "learning_rate": 6.048652202498357e-06, + "loss": 0.5125, + "step": 10542, + "task_loss": 0.4384304881095886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5600651502609253, + "epoch": 8.91, + "learning_rate": 6.043956043956044e-06, + "loss": 0.5287, + "step": 10543, + "task_loss": 0.4968068301677704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4539504647254944, + "epoch": 8.91, + "learning_rate": 6.0392598854137315e-06, + "loss": 0.6413, + "step": 10544, + "task_loss": 0.7371556758880615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7250306606292725, + "epoch": 8.91, + "learning_rate": 6.03456372687142e-06, + "loss": 0.4634, + "step": 10545, + "task_loss": 1.3915570974349976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36432817578315735, + "epoch": 8.91, + "learning_rate": 6.029867568329107e-06, + "loss": 0.5151, + "step": 10546, + "task_loss": 0.3992324471473694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3606415390968323, + "epoch": 8.91, + "learning_rate": 6.025171409786794e-06, + "loss": 0.4366, + "step": 10547, + "task_loss": 0.8253019452095032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.680189847946167, + "epoch": 8.92, + "learning_rate": 6.020475251244483e-06, + "loss": 0.6128, + "step": 10548, + "task_loss": 1.7388219833374023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7580942511558533, + "epoch": 8.92, + "learning_rate": 6.01577909270217e-06, + "loss": 0.5784, + "step": 10549, + "task_loss": 0.4446659982204437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7265965342521667, + "epoch": 8.92, + "learning_rate": 6.011082934159857e-06, + "loss": 0.6846, + "step": 10550, + "task_loss": 0.7343148589134216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7248297333717346, + "epoch": 8.92, + "learning_rate": 6.006386775617545e-06, + "loss": 0.6278, + "step": 10551, + "task_loss": 0.8422881364822388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4567148983478546, + "epoch": 8.92, + "learning_rate": 6.0016906170752324e-06, + "loss": 0.5052, + "step": 10552, + "task_loss": 0.34851792454719543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6441633105278015, + "epoch": 8.92, + "learning_rate": 5.9969944585329205e-06, + "loss": 0.8101, + "step": 10553, + "task_loss": 0.43911269307136536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4971086084842682, + "epoch": 8.92, + "learning_rate": 5.992298299990608e-06, + "loss": 0.5359, + "step": 10554, + "task_loss": 0.5593665838241577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6054649353027344, + "epoch": 8.92, + "learning_rate": 5.987602141448296e-06, + "loss": 0.5545, + "step": 10555, + "task_loss": 0.5217118859291077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3951791524887085, + "epoch": 8.92, + "learning_rate": 5.982905982905984e-06, + "loss": 0.5078, + "step": 10556, + "task_loss": 0.9322754740715027 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4569542407989502, + "epoch": 8.92, + "learning_rate": 5.978209824363671e-06, + "loss": 0.6605, + "step": 10557, + "task_loss": 1.0454529523849487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4214921295642853, + "epoch": 8.92, + "learning_rate": 5.973513665821358e-06, + "loss": 0.4589, + "step": 10558, + "task_loss": 0.8318082094192505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8459569215774536, + "epoch": 8.93, + "learning_rate": 5.968817507279046e-06, + "loss": 0.6993, + "step": 10559, + "task_loss": 0.8212286233901978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5788612365722656, + "epoch": 8.93, + "learning_rate": 5.964121348736733e-06, + "loss": 0.5711, + "step": 10560, + "task_loss": 1.0399996042251587 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0869815349578857, + "epoch": 8.93, + "learning_rate": 5.959425190194421e-06, + "loss": 0.6249, + "step": 10561, + "task_loss": 1.0399342775344849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8058860301971436, + "epoch": 8.93, + "learning_rate": 5.9547290316521085e-06, + "loss": 0.7149, + "step": 10562, + "task_loss": 0.8399339914321899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3476080894470215, + "epoch": 8.93, + "learning_rate": 5.9500328731097966e-06, + "loss": 0.4805, + "step": 10563, + "task_loss": 0.6809529066085815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6241342425346375, + "epoch": 8.93, + "learning_rate": 5.945336714567485e-06, + "loss": 0.4459, + "step": 10564, + "task_loss": 1.3063066005706787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.900982141494751, + "epoch": 8.93, + "learning_rate": 5.940640556025172e-06, + "loss": 0.598, + "step": 10565, + "task_loss": 0.7863163352012634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4424629211425781, + "epoch": 8.93, + "learning_rate": 5.935944397482859e-06, + "loss": 0.7145, + "step": 10566, + "task_loss": 0.8460557460784912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.538854718208313, + "epoch": 8.93, + "learning_rate": 5.931248238940547e-06, + "loss": 0.6042, + "step": 10567, + "task_loss": 0.26784974336624146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9073569178581238, + "epoch": 8.93, + "learning_rate": 5.926552080398234e-06, + "loss": 0.5776, + "step": 10568, + "task_loss": 0.6299015879631042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8600467443466187, + "epoch": 8.93, + "learning_rate": 5.921855921855922e-06, + "loss": 0.5787, + "step": 10569, + "task_loss": 0.7108116149902344 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7636242508888245, + "epoch": 8.93, + "learning_rate": 5.917159763313609e-06, + "loss": 0.6909, + "step": 10570, + "task_loss": 0.3891683518886566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4737194776535034, + "epoch": 8.94, + "learning_rate": 5.9124636047712974e-06, + "loss": 0.4325, + "step": 10571, + "task_loss": 1.2649235725402832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5232113599777222, + "epoch": 8.94, + "learning_rate": 5.9077674462289855e-06, + "loss": 0.5092, + "step": 10572, + "task_loss": 0.3710166811943054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6406015157699585, + "epoch": 8.94, + "learning_rate": 5.903071287686673e-06, + "loss": 0.582, + "step": 10573, + "task_loss": 1.0622791051864624 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.28076452016830444, + "epoch": 8.94, + "learning_rate": 5.89837512914436e-06, + "loss": 0.4897, + "step": 10574, + "task_loss": 0.22033900022506714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.73424232006073, + "epoch": 8.94, + "learning_rate": 5.893678970602048e-06, + "loss": 0.4436, + "step": 10575, + "task_loss": 0.33071139454841614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4246208071708679, + "epoch": 8.94, + "learning_rate": 5.888982812059735e-06, + "loss": 0.5423, + "step": 10576, + "task_loss": 0.22242748737335205 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38111612200737, + "epoch": 8.94, + "learning_rate": 5.884286653517423e-06, + "loss": 0.6167, + "step": 10577, + "task_loss": 0.30698180198669434 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5411278009414673, + "epoch": 8.94, + "learning_rate": 5.87959049497511e-06, + "loss": 0.7413, + "step": 10578, + "task_loss": 0.8103427886962891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43713247776031494, + "epoch": 8.94, + "learning_rate": 5.874894336432798e-06, + "loss": 0.7195, + "step": 10579, + "task_loss": 0.9134210348129272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3005964457988739, + "epoch": 8.94, + "learning_rate": 5.870198177890486e-06, + "loss": 0.451, + "step": 10580, + "task_loss": 0.2694101631641388 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49510639905929565, + "epoch": 8.94, + "learning_rate": 5.8655020193481735e-06, + "loss": 0.6588, + "step": 10581, + "task_loss": 0.6259457468986511 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.403123140335083, + "epoch": 8.94, + "learning_rate": 5.860805860805861e-06, + "loss": 0.4654, + "step": 10582, + "task_loss": 0.267294317483902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.542702317237854, + "epoch": 8.95, + "learning_rate": 5.856109702263549e-06, + "loss": 0.6111, + "step": 10583, + "task_loss": 0.11389429867267609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36455023288726807, + "epoch": 8.95, + "learning_rate": 5.851413543721236e-06, + "loss": 0.4608, + "step": 10584, + "task_loss": 0.12865935266017914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4708772897720337, + "epoch": 8.95, + "learning_rate": 5.846717385178924e-06, + "loss": 0.4848, + "step": 10585, + "task_loss": 0.24009481072425842 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7306826114654541, + "epoch": 8.95, + "learning_rate": 5.842021226636612e-06, + "loss": 0.5548, + "step": 10586, + "task_loss": 0.9158787727355957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7236275672912598, + "epoch": 8.95, + "learning_rate": 5.837325068094299e-06, + "loss": 0.5932, + "step": 10587, + "task_loss": 1.0569521188735962 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7704841494560242, + "epoch": 8.95, + "learning_rate": 5.832628909551987e-06, + "loss": 0.4546, + "step": 10588, + "task_loss": 0.8241244554519653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5749755501747131, + "epoch": 8.95, + "learning_rate": 5.8279327510096744e-06, + "loss": 0.5516, + "step": 10589, + "task_loss": 0.8147531747817993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48853567242622375, + "epoch": 8.95, + "learning_rate": 5.823236592467362e-06, + "loss": 0.6355, + "step": 10590, + "task_loss": 0.29662421345710754 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3158225417137146, + "epoch": 8.95, + "learning_rate": 5.81854043392505e-06, + "loss": 0.4613, + "step": 10591, + "task_loss": 0.4538353681564331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8548911809921265, + "epoch": 8.95, + "learning_rate": 5.813844275382737e-06, + "loss": 0.7016, + "step": 10592, + "task_loss": 1.8272366523742676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3361542820930481, + "epoch": 8.95, + "learning_rate": 5.809148116840425e-06, + "loss": 0.4179, + "step": 10593, + "task_loss": 0.23179684579372406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2522212266921997, + "epoch": 8.95, + "learning_rate": 5.804451958298113e-06, + "loss": 0.6995, + "step": 10594, + "task_loss": 1.558007001876831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7357101440429688, + "epoch": 8.96, + "learning_rate": 5.7997557997558e-06, + "loss": 0.5646, + "step": 10595, + "task_loss": 1.6892472505569458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2751644253730774, + "epoch": 8.96, + "learning_rate": 5.795059641213488e-06, + "loss": 0.4064, + "step": 10596, + "task_loss": 0.4793139696121216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4209744334220886, + "epoch": 8.96, + "learning_rate": 5.790363482671175e-06, + "loss": 0.5497, + "step": 10597, + "task_loss": 0.5476157069206238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.344992995262146, + "epoch": 8.96, + "learning_rate": 5.7856673241288625e-06, + "loss": 0.4452, + "step": 10598, + "task_loss": 0.5873465538024902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27394407987594604, + "epoch": 8.96, + "learning_rate": 5.7809711655865505e-06, + "loss": 0.4575, + "step": 10599, + "task_loss": 0.5968269109725952 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.521414041519165, + "epoch": 8.96, + "learning_rate": 5.776275007044238e-06, + "loss": 0.5233, + "step": 10600, + "task_loss": 0.5214977264404297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.617308497428894, + "epoch": 8.96, + "learning_rate": 5.771578848501926e-06, + "loss": 0.6645, + "step": 10601, + "task_loss": 0.45544153451919556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7717319130897522, + "epoch": 8.96, + "learning_rate": 5.766882689959614e-06, + "loss": 0.5364, + "step": 10602, + "task_loss": 1.0179121494293213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.914208710193634, + "epoch": 8.96, + "learning_rate": 5.762186531417301e-06, + "loss": 0.6355, + "step": 10603, + "task_loss": 0.8773550987243652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5308414697647095, + "epoch": 8.96, + "learning_rate": 5.757490372874989e-06, + "loss": 0.6215, + "step": 10604, + "task_loss": 0.6074046492576599 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5941941738128662, + "epoch": 8.96, + "learning_rate": 5.752794214332676e-06, + "loss": 0.5265, + "step": 10605, + "task_loss": 1.334627389907837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6161577105522156, + "epoch": 8.96, + "learning_rate": 5.748098055790363e-06, + "loss": 0.7118, + "step": 10606, + "task_loss": 1.05649995803833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42733895778656006, + "epoch": 8.97, + "learning_rate": 5.743401897248051e-06, + "loss": 0.673, + "step": 10607, + "task_loss": 0.5426110029220581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47834107279777527, + "epoch": 8.97, + "learning_rate": 5.738705738705739e-06, + "loss": 0.6304, + "step": 10608, + "task_loss": 0.6959487199783325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2916804552078247, + "epoch": 8.97, + "learning_rate": 5.734009580163427e-06, + "loss": 0.4511, + "step": 10609, + "task_loss": 0.5602156519889832 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6137771606445312, + "epoch": 8.97, + "learning_rate": 5.729313421621115e-06, + "loss": 0.6402, + "step": 10610, + "task_loss": 0.2187805473804474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4819800555706024, + "epoch": 8.97, + "learning_rate": 5.724617263078802e-06, + "loss": 0.4629, + "step": 10611, + "task_loss": 1.5057392120361328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5611621737480164, + "epoch": 8.97, + "learning_rate": 5.71992110453649e-06, + "loss": 0.5181, + "step": 10612, + "task_loss": 0.620520293712616 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3776792287826538, + "epoch": 8.97, + "learning_rate": 5.715224945994177e-06, + "loss": 0.5578, + "step": 10613, + "task_loss": 0.8055549263954163 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4176749289035797, + "epoch": 8.97, + "learning_rate": 5.710528787451864e-06, + "loss": 0.4776, + "step": 10614, + "task_loss": 0.5071150064468384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7397521734237671, + "epoch": 8.97, + "learning_rate": 5.705832628909552e-06, + "loss": 0.5064, + "step": 10615, + "task_loss": 0.9772310256958008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6495307683944702, + "epoch": 8.97, + "learning_rate": 5.7011364703672395e-06, + "loss": 0.6405, + "step": 10616, + "task_loss": 0.592419445514679 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7125726342201233, + "epoch": 8.97, + "learning_rate": 5.6964403118249275e-06, + "loss": 0.5695, + "step": 10617, + "task_loss": 0.2861241102218628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5696787238121033, + "epoch": 8.97, + "learning_rate": 5.6917441532826155e-06, + "loss": 0.4747, + "step": 10618, + "task_loss": 0.8371036052703857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7022693753242493, + "epoch": 8.98, + "learning_rate": 5.687047994740303e-06, + "loss": 0.6533, + "step": 10619, + "task_loss": 0.2178897261619568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.773979663848877, + "epoch": 8.98, + "learning_rate": 5.682351836197991e-06, + "loss": 0.5448, + "step": 10620, + "task_loss": 0.45013806223869324 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4895588457584381, + "epoch": 8.98, + "learning_rate": 5.677655677655678e-06, + "loss": 0.719, + "step": 10621, + "task_loss": 0.5551857948303223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6219139099121094, + "epoch": 8.98, + "learning_rate": 5.672959519113365e-06, + "loss": 0.5339, + "step": 10622, + "task_loss": 0.7996835112571716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4924367070198059, + "epoch": 8.98, + "learning_rate": 5.668263360571053e-06, + "loss": 0.5726, + "step": 10623, + "task_loss": 0.12421528995037079 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6398635506629944, + "epoch": 8.98, + "learning_rate": 5.66356720202874e-06, + "loss": 0.6373, + "step": 10624, + "task_loss": 0.9792771339416504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43375566601753235, + "epoch": 8.98, + "learning_rate": 5.658871043486428e-06, + "loss": 0.4617, + "step": 10625, + "task_loss": 0.6341457962989807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39524757862091064, + "epoch": 8.98, + "learning_rate": 5.6541748849441164e-06, + "loss": 0.4117, + "step": 10626, + "task_loss": 0.36518457531929016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38290005922317505, + "epoch": 8.98, + "learning_rate": 5.649478726401804e-06, + "loss": 0.5015, + "step": 10627, + "task_loss": 0.21354712545871735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42722660303115845, + "epoch": 8.98, + "learning_rate": 5.644782567859491e-06, + "loss": 0.4955, + "step": 10628, + "task_loss": 0.7511085867881775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5205132365226746, + "epoch": 8.98, + "learning_rate": 5.640086409317179e-06, + "loss": 0.4821, + "step": 10629, + "task_loss": 0.475449800491333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37586677074432373, + "epoch": 8.99, + "learning_rate": 5.635390250774866e-06, + "loss": 0.6439, + "step": 10630, + "task_loss": 0.8518162965774536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7536553144454956, + "epoch": 8.99, + "learning_rate": 5.630694092232554e-06, + "loss": 0.63, + "step": 10631, + "task_loss": 0.44915422797203064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.757983922958374, + "epoch": 8.99, + "learning_rate": 5.625997933690241e-06, + "loss": 0.6434, + "step": 10632, + "task_loss": 0.7227051258087158 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46571505069732666, + "epoch": 8.99, + "learning_rate": 5.621301775147929e-06, + "loss": 0.4219, + "step": 10633, + "task_loss": 0.6484419107437134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45685186982154846, + "epoch": 8.99, + "learning_rate": 5.616605616605617e-06, + "loss": 0.4767, + "step": 10634, + "task_loss": 0.4895040988922119 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.23881344497203827, + "epoch": 8.99, + "learning_rate": 5.6119094580633045e-06, + "loss": 0.5771, + "step": 10635, + "task_loss": 0.20051783323287964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5274030566215515, + "epoch": 8.99, + "learning_rate": 5.607213299520992e-06, + "loss": 0.5647, + "step": 10636, + "task_loss": 0.18868543207645416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5698296427726746, + "epoch": 8.99, + "learning_rate": 5.60251714097868e-06, + "loss": 0.5355, + "step": 10637, + "task_loss": 0.436833918094635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5624359250068665, + "epoch": 8.99, + "learning_rate": 5.597820982436367e-06, + "loss": 0.5512, + "step": 10638, + "task_loss": 1.210066795349121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6097291111946106, + "epoch": 8.99, + "learning_rate": 5.593124823894055e-06, + "loss": 0.5343, + "step": 10639, + "task_loss": 1.423251986503601 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5431709289550781, + "epoch": 8.99, + "learning_rate": 5.588428665351743e-06, + "loss": 0.6186, + "step": 10640, + "task_loss": 0.6638689637184143 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37089869379997253, + "epoch": 8.99, + "learning_rate": 5.58373250680943e-06, + "loss": 0.4626, + "step": 10641, + "task_loss": 1.4731700420379639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5372847318649292, + "epoch": 9.0, + "learning_rate": 5.579036348267118e-06, + "loss": 0.5969, + "step": 10642, + "task_loss": 0.14681988954544067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5942049026489258, + "epoch": 9.0, + "learning_rate": 5.574340189724805e-06, + "loss": 0.5654, + "step": 10643, + "task_loss": 0.41353386640548706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4840534031391144, + "epoch": 9.0, + "learning_rate": 5.5696440311824926e-06, + "loss": 0.5238, + "step": 10644, + "task_loss": 0.6436223387718201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0348104238510132, + "epoch": 9.0, + "learning_rate": 5.564947872640181e-06, + "loss": 0.5447, + "step": 10645, + "task_loss": 0.5985457897186279 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49458739161491394, + "epoch": 9.0, + "learning_rate": 5.560251714097868e-06, + "loss": 0.5415, + "step": 10646, + "task_loss": 0.5283011794090271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3433035612106323, + "epoch": 9.0, + "learning_rate": 5.555555555555556e-06, + "loss": 0.482, + "step": 10647, + "task_loss": 0.7570391893386841 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.407757967710495, + "epoch": 9.0, + "learning_rate": 5.550859397013244e-06, + "loss": 0.8095, + "step": 10648, + "task_loss": 0.23411524295806885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5599245429039001, + "epoch": 9.0, + "learning_rate": 5.546163238470931e-06, + "loss": 0.5671, + "step": 10649, + "task_loss": 0.6180043816566467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8195317983627319, + "epoch": 9.0, + "learning_rate": 5.541467079928619e-06, + "loss": 0.5945, + "step": 10650, + "task_loss": 0.719485342502594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35114070773124695, + "epoch": 9.0, + "learning_rate": 5.536770921386306e-06, + "loss": 0.5078, + "step": 10651, + "task_loss": 0.8089998364448547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5093770623207092, + "epoch": 9.0, + "learning_rate": 5.5320747628439934e-06, + "loss": 0.5048, + "step": 10652, + "task_loss": 0.2606903612613678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34760093688964844, + "epoch": 9.01, + "learning_rate": 5.5273786043016815e-06, + "loss": 0.45, + "step": 10653, + "task_loss": 0.07798047363758087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34830141067504883, + "epoch": 9.01, + "learning_rate": 5.522682445759369e-06, + "loss": 0.6918, + "step": 10654, + "task_loss": 0.45146510004997253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.22173309326171875, + "epoch": 9.01, + "learning_rate": 5.517986287217057e-06, + "loss": 0.3804, + "step": 10655, + "task_loss": 0.036402247846126556 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39955711364746094, + "epoch": 9.01, + "learning_rate": 5.513290128674745e-06, + "loss": 0.6095, + "step": 10656, + "task_loss": 0.3125511109828949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9853004813194275, + "epoch": 9.01, + "learning_rate": 5.508593970132432e-06, + "loss": 0.7771, + "step": 10657, + "task_loss": 1.1309833526611328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5019111633300781, + "epoch": 9.01, + "learning_rate": 5.50389781159012e-06, + "loss": 0.4234, + "step": 10658, + "task_loss": 0.544341504573822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5345777869224548, + "epoch": 9.01, + "learning_rate": 5.499201653047807e-06, + "loss": 0.5498, + "step": 10659, + "task_loss": 0.8023315072059631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44157788157463074, + "epoch": 9.01, + "learning_rate": 5.494505494505494e-06, + "loss": 0.4246, + "step": 10660, + "task_loss": 1.2218419313430786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6741924285888672, + "epoch": 9.01, + "learning_rate": 5.489809335963182e-06, + "loss": 0.5145, + "step": 10661, + "task_loss": 1.5363187789916992 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5180450677871704, + "epoch": 9.01, + "learning_rate": 5.4851131774208696e-06, + "loss": 0.5938, + "step": 10662, + "task_loss": 0.9459388852119446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7340129017829895, + "epoch": 9.01, + "learning_rate": 5.480417018878558e-06, + "loss": 0.5409, + "step": 10663, + "task_loss": 0.998272180557251 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38078004121780396, + "epoch": 9.01, + "learning_rate": 5.475720860336246e-06, + "loss": 0.6563, + "step": 10664, + "task_loss": 0.15468423068523407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.441752552986145, + "epoch": 9.02, + "learning_rate": 5.471024701793933e-06, + "loss": 0.5418, + "step": 10665, + "task_loss": 1.0044411420822144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5145384073257446, + "epoch": 9.02, + "learning_rate": 5.466328543251621e-06, + "loss": 0.5232, + "step": 10666, + "task_loss": 0.5512675046920776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5974806547164917, + "epoch": 9.02, + "learning_rate": 5.461632384709308e-06, + "loss": 0.4971, + "step": 10667, + "task_loss": 1.0341695547103882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5553049445152283, + "epoch": 9.02, + "learning_rate": 5.456936226166995e-06, + "loss": 0.5026, + "step": 10668, + "task_loss": 0.6006874442100525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5564650893211365, + "epoch": 9.02, + "learning_rate": 5.452240067624683e-06, + "loss": 0.6138, + "step": 10669, + "task_loss": 0.7951217889785767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7239872217178345, + "epoch": 9.02, + "learning_rate": 5.4475439090823704e-06, + "loss": 0.5086, + "step": 10670, + "task_loss": 0.6583983302116394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4491655230522156, + "epoch": 9.02, + "learning_rate": 5.4428477505400585e-06, + "loss": 0.5441, + "step": 10671, + "task_loss": 0.5809100866317749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.528498113155365, + "epoch": 9.02, + "learning_rate": 5.4381515919977465e-06, + "loss": 0.4721, + "step": 10672, + "task_loss": 1.106186866760254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4860708713531494, + "epoch": 9.02, + "learning_rate": 5.433455433455434e-06, + "loss": 0.659, + "step": 10673, + "task_loss": 0.3552553057670593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7622365951538086, + "epoch": 9.02, + "learning_rate": 5.428759274913122e-06, + "loss": 0.5915, + "step": 10674, + "task_loss": 1.056890845298767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47468799352645874, + "epoch": 9.02, + "learning_rate": 5.424063116370809e-06, + "loss": 0.5797, + "step": 10675, + "task_loss": 0.1666695773601532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36272135376930237, + "epoch": 9.02, + "learning_rate": 5.419366957828496e-06, + "loss": 0.3778, + "step": 10676, + "task_loss": 0.4107912480831146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2222643345594406, + "epoch": 9.03, + "learning_rate": 5.414670799286184e-06, + "loss": 0.3718, + "step": 10677, + "task_loss": 0.06729756295681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8407806158065796, + "epoch": 9.03, + "learning_rate": 5.409974640743871e-06, + "loss": 0.5842, + "step": 10678, + "task_loss": 0.739843487739563 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5900310277938843, + "epoch": 9.03, + "learning_rate": 5.405278482201559e-06, + "loss": 0.4986, + "step": 10679, + "task_loss": 0.6322044134140015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48569199442863464, + "epoch": 9.03, + "learning_rate": 5.400582323659247e-06, + "loss": 0.4689, + "step": 10680, + "task_loss": 0.31120049953460693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32002347707748413, + "epoch": 9.03, + "learning_rate": 5.3958861651169346e-06, + "loss": 0.5822, + "step": 10681, + "task_loss": 0.77793288230896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.776505708694458, + "epoch": 9.03, + "learning_rate": 5.391190006574623e-06, + "loss": 0.7033, + "step": 10682, + "task_loss": 1.2588181495666504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5574590563774109, + "epoch": 9.03, + "learning_rate": 5.38649384803231e-06, + "loss": 0.4978, + "step": 10683, + "task_loss": 0.3758609890937805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5000337958335876, + "epoch": 9.03, + "learning_rate": 5.381797689489997e-06, + "loss": 0.5536, + "step": 10684, + "task_loss": 0.615101158618927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3912767767906189, + "epoch": 9.03, + "learning_rate": 5.377101530947685e-06, + "loss": 0.377, + "step": 10685, + "task_loss": 0.5905370712280273 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6533083319664001, + "epoch": 9.03, + "learning_rate": 5.372405372405373e-06, + "loss": 0.5492, + "step": 10686, + "task_loss": 1.3165231943130493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4646679162979126, + "epoch": 9.03, + "learning_rate": 5.36770921386306e-06, + "loss": 0.6037, + "step": 10687, + "task_loss": 0.6696792840957642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4812178611755371, + "epoch": 9.03, + "learning_rate": 5.363013055320748e-06, + "loss": 0.5397, + "step": 10688, + "task_loss": 0.9114646315574646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35127007961273193, + "epoch": 9.04, + "learning_rate": 5.3583168967784355e-06, + "loss": 0.4817, + "step": 10689, + "task_loss": 0.2912335991859436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4510999321937561, + "epoch": 9.04, + "learning_rate": 5.3536207382361235e-06, + "loss": 0.5138, + "step": 10690, + "task_loss": 1.0328394174575806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7420287132263184, + "epoch": 9.04, + "learning_rate": 5.348924579693811e-06, + "loss": 0.6592, + "step": 10691, + "task_loss": 1.5977141857147217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.584267258644104, + "epoch": 9.04, + "learning_rate": 5.344228421151498e-06, + "loss": 0.501, + "step": 10692, + "task_loss": 0.8599151372909546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4502565562725067, + "epoch": 9.04, + "learning_rate": 5.339532262609186e-06, + "loss": 0.574, + "step": 10693, + "task_loss": 1.053470253944397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34601396322250366, + "epoch": 9.04, + "learning_rate": 5.334836104066874e-06, + "loss": 0.4976, + "step": 10694, + "task_loss": 0.870319128036499 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30170923471450806, + "epoch": 9.04, + "learning_rate": 5.330139945524561e-06, + "loss": 0.4996, + "step": 10695, + "task_loss": 1.048630714416504 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32882335782051086, + "epoch": 9.04, + "learning_rate": 5.325443786982249e-06, + "loss": 0.5102, + "step": 10696, + "task_loss": 0.5748003721237183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2617471218109131, + "epoch": 9.04, + "learning_rate": 5.320747628439936e-06, + "loss": 0.5094, + "step": 10697, + "task_loss": 0.1973281353712082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7230280637741089, + "epoch": 9.04, + "learning_rate": 5.316051469897624e-06, + "loss": 0.636, + "step": 10698, + "task_loss": 1.0312447547912598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.499292254447937, + "epoch": 9.04, + "learning_rate": 5.3113553113553116e-06, + "loss": 0.4413, + "step": 10699, + "task_loss": 0.5267291069030762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7206746935844421, + "epoch": 9.04, + "learning_rate": 5.306659152812999e-06, + "loss": 0.5219, + "step": 10700, + "task_loss": 0.582303524017334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.18800894916057587, + "epoch": 9.05, + "learning_rate": 5.301962994270687e-06, + "loss": 0.3929, + "step": 10701, + "task_loss": 0.00767991878092289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6911872625350952, + "epoch": 9.05, + "learning_rate": 5.297266835728375e-06, + "loss": 0.6601, + "step": 10702, + "task_loss": 0.403439998626709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6180187463760376, + "epoch": 9.05, + "learning_rate": 5.292570677186062e-06, + "loss": 0.4926, + "step": 10703, + "task_loss": 0.4618394076824188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8982678055763245, + "epoch": 9.05, + "learning_rate": 5.28787451864375e-06, + "loss": 0.7141, + "step": 10704, + "task_loss": 1.3920810222625732 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42901918292045593, + "epoch": 9.05, + "learning_rate": 5.283178360101437e-06, + "loss": 0.4633, + "step": 10705, + "task_loss": 0.6130470037460327 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5027579069137573, + "epoch": 9.05, + "learning_rate": 5.278482201559124e-06, + "loss": 0.5714, + "step": 10706, + "task_loss": 0.5893825888633728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5195333957672119, + "epoch": 9.05, + "learning_rate": 5.2737860430168124e-06, + "loss": 0.657, + "step": 10707, + "task_loss": 1.0011210441589355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3038981556892395, + "epoch": 9.05, + "learning_rate": 5.2690898844745e-06, + "loss": 0.4822, + "step": 10708, + "task_loss": 1.102056860923767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4899473190307617, + "epoch": 9.05, + "learning_rate": 5.264393725932188e-06, + "loss": 0.489, + "step": 10709, + "task_loss": 0.2596272826194763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5682339668273926, + "epoch": 9.05, + "learning_rate": 5.259697567389876e-06, + "loss": 0.6632, + "step": 10710, + "task_loss": 0.5928970575332642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5781933069229126, + "epoch": 9.05, + "learning_rate": 5.255001408847563e-06, + "loss": 0.5011, + "step": 10711, + "task_loss": 0.5947277545928955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.524718701839447, + "epoch": 9.05, + "learning_rate": 5.250305250305251e-06, + "loss": 0.5594, + "step": 10712, + "task_loss": 0.6361549496650696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5746286511421204, + "epoch": 9.06, + "learning_rate": 5.245609091762938e-06, + "loss": 0.5644, + "step": 10713, + "task_loss": 0.21720650792121887 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4651991128921509, + "epoch": 9.06, + "learning_rate": 5.240912933220625e-06, + "loss": 0.5575, + "step": 10714, + "task_loss": 0.9711751937866211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8578423261642456, + "epoch": 9.06, + "learning_rate": 5.236216774678313e-06, + "loss": 0.5909, + "step": 10715, + "task_loss": 1.534393310546875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.23297281563282013, + "epoch": 9.06, + "learning_rate": 5.2315206161360005e-06, + "loss": 0.445, + "step": 10716, + "task_loss": 0.3661234974861145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48204970359802246, + "epoch": 9.06, + "learning_rate": 5.2268244575936885e-06, + "loss": 0.6647, + "step": 10717, + "task_loss": 0.7805172801017761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4789668023586273, + "epoch": 9.06, + "learning_rate": 5.2221282990513766e-06, + "loss": 0.5711, + "step": 10718, + "task_loss": 0.20756149291992188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9267908334732056, + "epoch": 9.06, + "learning_rate": 5.217432140509064e-06, + "loss": 0.864, + "step": 10719, + "task_loss": 0.9171956777572632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6839046478271484, + "epoch": 9.06, + "learning_rate": 5.212735981966752e-06, + "loss": 0.4646, + "step": 10720, + "task_loss": 0.512065589427948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3602970242500305, + "epoch": 9.06, + "learning_rate": 5.208039823424439e-06, + "loss": 0.5157, + "step": 10721, + "task_loss": 0.511012852191925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3568427562713623, + "epoch": 9.06, + "learning_rate": 5.203343664882126e-06, + "loss": 0.4484, + "step": 10722, + "task_loss": 0.04860250651836395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7669222354888916, + "epoch": 9.06, + "learning_rate": 5.198647506339814e-06, + "loss": 0.5706, + "step": 10723, + "task_loss": 0.23311369121074677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30617988109588623, + "epoch": 9.07, + "learning_rate": 5.193951347797501e-06, + "loss": 0.6441, + "step": 10724, + "task_loss": 0.2729555666446686 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47705668210983276, + "epoch": 9.07, + "learning_rate": 5.189255189255189e-06, + "loss": 0.5855, + "step": 10725, + "task_loss": 0.9603689312934875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4342404901981354, + "epoch": 9.07, + "learning_rate": 5.1845590307128775e-06, + "loss": 0.5594, + "step": 10726, + "task_loss": 0.7274681925773621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7334962487220764, + "epoch": 9.07, + "learning_rate": 5.179862872170565e-06, + "loss": 0.7331, + "step": 10727, + "task_loss": 1.028825283050537 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36315786838531494, + "epoch": 9.07, + "learning_rate": 5.175166713628253e-06, + "loss": 0.6233, + "step": 10728, + "task_loss": 0.48624899983406067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6651328802108765, + "epoch": 9.07, + "learning_rate": 5.17047055508594e-06, + "loss": 0.6292, + "step": 10729, + "task_loss": 0.8246500492095947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5990895628929138, + "epoch": 9.07, + "learning_rate": 5.165774396543627e-06, + "loss": 0.5885, + "step": 10730, + "task_loss": 1.2375344038009644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4557642340660095, + "epoch": 9.07, + "learning_rate": 5.161078238001315e-06, + "loss": 0.4836, + "step": 10731, + "task_loss": 0.5624407529830933 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25203099846839905, + "epoch": 9.07, + "learning_rate": 5.156382079459002e-06, + "loss": 0.5009, + "step": 10732, + "task_loss": 0.17915724217891693 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7802634239196777, + "epoch": 9.07, + "learning_rate": 5.15168592091669e-06, + "loss": 0.5887, + "step": 10733, + "task_loss": 0.898050844669342 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6418172121047974, + "epoch": 9.07, + "learning_rate": 5.146989762374378e-06, + "loss": 0.5334, + "step": 10734, + "task_loss": 0.6073089838027954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2364717423915863, + "epoch": 9.07, + "learning_rate": 5.1422936038320655e-06, + "loss": 0.355, + "step": 10735, + "task_loss": 0.016945166513323784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7723769545555115, + "epoch": 9.08, + "learning_rate": 5.1375974452897536e-06, + "loss": 0.5572, + "step": 10736, + "task_loss": 0.9320164918899536 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34681376814842224, + "epoch": 9.08, + "learning_rate": 5.132901286747441e-06, + "loss": 0.4558, + "step": 10737, + "task_loss": 0.9558616280555725 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.543147087097168, + "epoch": 9.08, + "learning_rate": 5.128205128205128e-06, + "loss": 0.4375, + "step": 10738, + "task_loss": 1.7354140281677246 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44907259941101074, + "epoch": 9.08, + "learning_rate": 5.123508969662816e-06, + "loss": 0.6413, + "step": 10739, + "task_loss": 0.891435444355011 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4230945110321045, + "epoch": 9.08, + "learning_rate": 5.118812811120504e-06, + "loss": 0.4144, + "step": 10740, + "task_loss": 0.45406728982925415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4339078664779663, + "epoch": 9.08, + "learning_rate": 5.114116652578191e-06, + "loss": 0.6375, + "step": 10741, + "task_loss": 0.813127875328064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31925180554389954, + "epoch": 9.08, + "learning_rate": 5.109420494035879e-06, + "loss": 0.4648, + "step": 10742, + "task_loss": 0.7864236235618591 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3511282801628113, + "epoch": 9.08, + "learning_rate": 5.104724335493566e-06, + "loss": 0.7055, + "step": 10743, + "task_loss": 0.30674657225608826 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5844024419784546, + "epoch": 9.08, + "learning_rate": 5.1000281769512544e-06, + "loss": 0.5513, + "step": 10744, + "task_loss": 0.8306488990783691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9641013145446777, + "epoch": 9.08, + "learning_rate": 5.095332018408942e-06, + "loss": 0.6464, + "step": 10745, + "task_loss": 0.8551418781280518 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2794591188430786, + "epoch": 9.08, + "learning_rate": 5.090635859866629e-06, + "loss": 0.4101, + "step": 10746, + "task_loss": 0.3487391769886017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4486106038093567, + "epoch": 9.08, + "learning_rate": 5.085939701324317e-06, + "loss": 0.4133, + "step": 10747, + "task_loss": 1.3279588222503662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6490987539291382, + "epoch": 9.09, + "learning_rate": 5.081243542782005e-06, + "loss": 0.5962, + "step": 10748, + "task_loss": 0.41047894954681396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3758861720561981, + "epoch": 9.09, + "learning_rate": 5.076547384239692e-06, + "loss": 0.4599, + "step": 10749, + "task_loss": 0.7316484451293945 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33905941247940063, + "epoch": 9.09, + "learning_rate": 5.07185122569738e-06, + "loss": 0.4225, + "step": 10750, + "task_loss": 0.2673392593860626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34627819061279297, + "epoch": 9.09, + "learning_rate": 5.067155067155067e-06, + "loss": 0.6528, + "step": 10751, + "task_loss": 0.5335155725479126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5842825174331665, + "epoch": 9.09, + "learning_rate": 5.062458908612755e-06, + "loss": 0.51, + "step": 10752, + "task_loss": 0.533616840839386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5547326803207397, + "epoch": 9.09, + "learning_rate": 5.0577627500704425e-06, + "loss": 0.4768, + "step": 10753, + "task_loss": 0.2990473508834839 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6757566928863525, + "epoch": 9.09, + "learning_rate": 5.05306659152813e-06, + "loss": 0.4764, + "step": 10754, + "task_loss": 0.9881812334060669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.19561654329299927, + "epoch": 9.09, + "learning_rate": 5.048370432985818e-06, + "loss": 0.5194, + "step": 10755, + "task_loss": 0.40456876158714294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4779481291770935, + "epoch": 9.09, + "learning_rate": 5.043674274443506e-06, + "loss": 0.5103, + "step": 10756, + "task_loss": 0.33315205574035645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34403663873672485, + "epoch": 9.09, + "learning_rate": 5.038978115901193e-06, + "loss": 0.4056, + "step": 10757, + "task_loss": 0.36416274309158325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39299169182777405, + "epoch": 9.09, + "learning_rate": 5.034281957358881e-06, + "loss": 0.5137, + "step": 10758, + "task_loss": 0.9803383946418762 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39156779646873474, + "epoch": 9.09, + "learning_rate": 5.029585798816568e-06, + "loss": 0.5393, + "step": 10759, + "task_loss": 0.7420110106468201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4752480089664459, + "epoch": 9.1, + "learning_rate": 5.024889640274256e-06, + "loss": 0.5198, + "step": 10760, + "task_loss": 0.03501179814338684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44035688042640686, + "epoch": 9.1, + "learning_rate": 5.020193481731943e-06, + "loss": 0.6122, + "step": 10761, + "task_loss": 0.19357067346572876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4493807852268219, + "epoch": 9.1, + "learning_rate": 5.0154973231896306e-06, + "loss": 0.4867, + "step": 10762, + "task_loss": 0.7728389501571655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31514665484428406, + "epoch": 9.1, + "learning_rate": 5.0108011646473195e-06, + "loss": 0.4769, + "step": 10763, + "task_loss": 0.6905890703201294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49011969566345215, + "epoch": 9.1, + "learning_rate": 5.006105006105007e-06, + "loss": 0.4738, + "step": 10764, + "task_loss": 0.30710792541503906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45383813977241516, + "epoch": 9.1, + "learning_rate": 5.001408847562694e-06, + "loss": 0.7214, + "step": 10765, + "task_loss": 1.0173850059509277 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4809972047805786, + "epoch": 9.1, + "learning_rate": 4.996712689020382e-06, + "loss": 0.6575, + "step": 10766, + "task_loss": 0.5342851281166077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5289740562438965, + "epoch": 9.1, + "learning_rate": 4.992016530478069e-06, + "loss": 0.5506, + "step": 10767, + "task_loss": 0.7263054251670837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.670602023601532, + "epoch": 9.1, + "learning_rate": 4.987320371935757e-06, + "loss": 0.5047, + "step": 10768, + "task_loss": 0.40711352229118347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7047711610794067, + "epoch": 9.1, + "learning_rate": 4.982624213393444e-06, + "loss": 0.5557, + "step": 10769, + "task_loss": 0.7503254413604736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34603285789489746, + "epoch": 9.1, + "learning_rate": 4.9779280548511315e-06, + "loss": 0.4481, + "step": 10770, + "task_loss": 0.1292446106672287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37457287311553955, + "epoch": 9.1, + "learning_rate": 4.97323189630882e-06, + "loss": 0.476, + "step": 10771, + "task_loss": 0.40658149123191833 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4471898674964905, + "epoch": 9.11, + "learning_rate": 4.9685357377665075e-06, + "loss": 0.3963, + "step": 10772, + "task_loss": 0.7775158286094666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.662043571472168, + "epoch": 9.11, + "learning_rate": 4.963839579224195e-06, + "loss": 0.5034, + "step": 10773, + "task_loss": 1.457747220993042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7967023253440857, + "epoch": 9.11, + "learning_rate": 4.959143420681883e-06, + "loss": 0.6398, + "step": 10774, + "task_loss": 0.7316052913665771 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5361703038215637, + "epoch": 9.11, + "learning_rate": 4.95444726213957e-06, + "loss": 0.8014, + "step": 10775, + "task_loss": 0.8551503419876099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9573431611061096, + "epoch": 9.11, + "learning_rate": 4.949751103597258e-06, + "loss": 0.5834, + "step": 10776, + "task_loss": 0.6662958264350891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3358168601989746, + "epoch": 9.11, + "learning_rate": 4.945054945054945e-06, + "loss": 0.734, + "step": 10777, + "task_loss": 1.679290771484375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42570048570632935, + "epoch": 9.11, + "learning_rate": 4.940358786512632e-06, + "loss": 0.658, + "step": 10778, + "task_loss": 0.810636579990387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6662504076957703, + "epoch": 9.11, + "learning_rate": 4.935662627970321e-06, + "loss": 0.6606, + "step": 10779, + "task_loss": 0.9691312313079834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4030231237411499, + "epoch": 9.11, + "learning_rate": 4.930966469428008e-06, + "loss": 0.5056, + "step": 10780, + "task_loss": 1.1635175943374634 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44069385528564453, + "epoch": 9.11, + "learning_rate": 4.926270310885696e-06, + "loss": 0.5268, + "step": 10781, + "task_loss": 0.9235860705375671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6784002780914307, + "epoch": 9.11, + "learning_rate": 4.921574152343384e-06, + "loss": 0.4624, + "step": 10782, + "task_loss": 1.0753692388534546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5162583589553833, + "epoch": 9.11, + "learning_rate": 4.916877993801071e-06, + "loss": 0.4642, + "step": 10783, + "task_loss": 0.7312564849853516 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6078613996505737, + "epoch": 9.12, + "learning_rate": 4.912181835258758e-06, + "loss": 0.4801, + "step": 10784, + "task_loss": 0.34435930848121643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6030712127685547, + "epoch": 9.12, + "learning_rate": 4.907485676716446e-06, + "loss": 0.6043, + "step": 10785, + "task_loss": 1.0006641149520874 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37262052297592163, + "epoch": 9.12, + "learning_rate": 4.902789518174133e-06, + "loss": 0.4406, + "step": 10786, + "task_loss": 0.2642630934715271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6639752388000488, + "epoch": 9.12, + "learning_rate": 4.898093359631821e-06, + "loss": 0.5826, + "step": 10787, + "task_loss": 0.7475026249885559 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6616700291633606, + "epoch": 9.12, + "learning_rate": 4.893397201089509e-06, + "loss": 0.5987, + "step": 10788, + "task_loss": 0.4118358790874481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7351030111312866, + "epoch": 9.12, + "learning_rate": 4.8887010425471965e-06, + "loss": 0.6436, + "step": 10789, + "task_loss": 0.9862117767333984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4709773361682892, + "epoch": 9.12, + "learning_rate": 4.8840048840048845e-06, + "loss": 0.4338, + "step": 10790, + "task_loss": 0.7355362772941589 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5232301354408264, + "epoch": 9.12, + "learning_rate": 4.879308725462572e-06, + "loss": 0.4295, + "step": 10791, + "task_loss": 0.34780845046043396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5388599038124084, + "epoch": 9.12, + "learning_rate": 4.874612566920259e-06, + "loss": 0.5009, + "step": 10792, + "task_loss": 1.0823311805725098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5338149070739746, + "epoch": 9.12, + "learning_rate": 4.869916408377947e-06, + "loss": 0.5472, + "step": 10793, + "task_loss": 1.07578444480896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4928658604621887, + "epoch": 9.12, + "learning_rate": 4.865220249835635e-06, + "loss": 0.5866, + "step": 10794, + "task_loss": 0.4112739562988281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2564661502838135, + "epoch": 9.13, + "learning_rate": 4.860524091293322e-06, + "loss": 0.4326, + "step": 10795, + "task_loss": 1.4365452527999878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35808420181274414, + "epoch": 9.13, + "learning_rate": 4.85582793275101e-06, + "loss": 0.6037, + "step": 10796, + "task_loss": 0.5430862903594971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6631271243095398, + "epoch": 9.13, + "learning_rate": 4.851131774208697e-06, + "loss": 0.5459, + "step": 10797, + "task_loss": 0.49775949120521545 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5071085095405579, + "epoch": 9.13, + "learning_rate": 4.846435615666385e-06, + "loss": 0.7329, + "step": 10798, + "task_loss": 0.7601740956306458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36447539925575256, + "epoch": 9.13, + "learning_rate": 4.841739457124073e-06, + "loss": 0.4384, + "step": 10799, + "task_loss": 0.886769711971283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3594026267528534, + "epoch": 9.13, + "learning_rate": 4.83704329858176e-06, + "loss": 0.5511, + "step": 10800, + "task_loss": 0.39929890632629395 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46356400847435, + "epoch": 9.13, + "learning_rate": 4.832347140039448e-06, + "loss": 0.4963, + "step": 10801, + "task_loss": 0.8840184807777405 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4519862234592438, + "epoch": 9.13, + "learning_rate": 4.827650981497136e-06, + "loss": 0.5311, + "step": 10802, + "task_loss": 1.7646710872650146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2951430082321167, + "epoch": 9.13, + "learning_rate": 4.822954822954823e-06, + "loss": 0.5345, + "step": 10803, + "task_loss": 0.7790025472640991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5372967720031738, + "epoch": 9.13, + "learning_rate": 4.818258664412511e-06, + "loss": 0.497, + "step": 10804, + "task_loss": 0.3373531401157379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7428412437438965, + "epoch": 9.13, + "learning_rate": 4.813562505870198e-06, + "loss": 0.6931, + "step": 10805, + "task_loss": 0.7620967626571655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5248833894729614, + "epoch": 9.13, + "learning_rate": 4.808866347327886e-06, + "loss": 0.6821, + "step": 10806, + "task_loss": 0.9320053458213806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5163807272911072, + "epoch": 9.14, + "learning_rate": 4.8041701887855735e-06, + "loss": 0.4376, + "step": 10807, + "task_loss": 0.537032961845398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.584462583065033, + "epoch": 9.14, + "learning_rate": 4.799474030243261e-06, + "loss": 0.632, + "step": 10808, + "task_loss": 0.5542635321617126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4270203709602356, + "epoch": 9.14, + "learning_rate": 4.794777871700949e-06, + "loss": 0.4892, + "step": 10809, + "task_loss": 0.41152963042259216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6766055822372437, + "epoch": 9.14, + "learning_rate": 4.790081713158637e-06, + "loss": 0.6922, + "step": 10810, + "task_loss": 1.0806504487991333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43792641162872314, + "epoch": 9.14, + "learning_rate": 4.785385554616324e-06, + "loss": 0.529, + "step": 10811, + "task_loss": 0.8878472447395325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2180333435535431, + "epoch": 9.14, + "learning_rate": 4.780689396074012e-06, + "loss": 0.4153, + "step": 10812, + "task_loss": 0.16479995846748352 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4482658803462982, + "epoch": 9.14, + "learning_rate": 4.775993237531699e-06, + "loss": 0.4564, + "step": 10813, + "task_loss": 0.1626623123884201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.504636824131012, + "epoch": 9.14, + "learning_rate": 4.771297078989387e-06, + "loss": 0.4248, + "step": 10814, + "task_loss": 0.17351432144641876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.548913300037384, + "epoch": 9.14, + "learning_rate": 4.766600920447074e-06, + "loss": 0.4179, + "step": 10815, + "task_loss": 1.243747591972351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5018069744110107, + "epoch": 9.14, + "learning_rate": 4.7619047619047615e-06, + "loss": 0.4569, + "step": 10816, + "task_loss": 0.6563104391098022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7317501306533813, + "epoch": 9.14, + "learning_rate": 4.75720860336245e-06, + "loss": 0.5551, + "step": 10817, + "task_loss": 1.1124718189239502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42188531160354614, + "epoch": 9.14, + "learning_rate": 4.752512444820138e-06, + "loss": 0.6695, + "step": 10818, + "task_loss": 0.5033867955207825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.20971906185150146, + "epoch": 9.15, + "learning_rate": 4.747816286277825e-06, + "loss": 0.4099, + "step": 10819, + "task_loss": 0.20005862414836884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5090563297271729, + "epoch": 9.15, + "learning_rate": 4.743120127735513e-06, + "loss": 0.7195, + "step": 10820, + "task_loss": 0.49531421065330505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40505844354629517, + "epoch": 9.15, + "learning_rate": 4.7384239691932e-06, + "loss": 0.4571, + "step": 10821, + "task_loss": 0.26272881031036377 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5439514517784119, + "epoch": 9.15, + "learning_rate": 4.733727810650888e-06, + "loss": 0.5338, + "step": 10822, + "task_loss": 0.5342209339141846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48501870036125183, + "epoch": 9.15, + "learning_rate": 4.729031652108575e-06, + "loss": 0.5947, + "step": 10823, + "task_loss": 0.5919874906539917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.508482038974762, + "epoch": 9.15, + "learning_rate": 4.724335493566262e-06, + "loss": 0.3929, + "step": 10824, + "task_loss": 0.2973504662513733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9996296763420105, + "epoch": 9.15, + "learning_rate": 4.719639335023951e-06, + "loss": 0.6826, + "step": 10825, + "task_loss": 0.7748823761940002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26406335830688477, + "epoch": 9.15, + "learning_rate": 4.7149431764816385e-06, + "loss": 0.515, + "step": 10826, + "task_loss": 0.181888610124588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6267328262329102, + "epoch": 9.15, + "learning_rate": 4.710247017939326e-06, + "loss": 0.5668, + "step": 10827, + "task_loss": 1.2325279712677002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34023815393447876, + "epoch": 9.15, + "learning_rate": 4.705550859397014e-06, + "loss": 0.5307, + "step": 10828, + "task_loss": 0.2356719970703125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48738422989845276, + "epoch": 9.15, + "learning_rate": 4.700854700854701e-06, + "loss": 0.5421, + "step": 10829, + "task_loss": 0.19997376203536987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7565172910690308, + "epoch": 9.15, + "learning_rate": 4.696158542312389e-06, + "loss": 0.624, + "step": 10830, + "task_loss": 2.2238430976867676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26180344820022583, + "epoch": 9.16, + "learning_rate": 4.691462383770076e-06, + "loss": 0.5469, + "step": 10831, + "task_loss": 0.5909162759780884 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.513268768787384, + "epoch": 9.16, + "learning_rate": 4.686766225227763e-06, + "loss": 0.6554, + "step": 10832, + "task_loss": 0.7978126406669617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5059584379196167, + "epoch": 9.16, + "learning_rate": 4.682070066685452e-06, + "loss": 0.58, + "step": 10833, + "task_loss": 0.7370092868804932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5216562151908875, + "epoch": 9.16, + "learning_rate": 4.677373908143139e-06, + "loss": 0.4027, + "step": 10834, + "task_loss": 0.7828550338745117 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4096694886684418, + "epoch": 9.16, + "learning_rate": 4.6726777496008265e-06, + "loss": 0.6327, + "step": 10835, + "task_loss": 0.46410971879959106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.493735671043396, + "epoch": 9.16, + "learning_rate": 4.667981591058515e-06, + "loss": 0.7325, + "step": 10836, + "task_loss": 1.030170202255249 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9138497114181519, + "epoch": 9.16, + "learning_rate": 4.663285432516202e-06, + "loss": 0.5628, + "step": 10837, + "task_loss": 1.2414358854293823 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3063437342643738, + "epoch": 9.16, + "learning_rate": 4.65858927397389e-06, + "loss": 0.6204, + "step": 10838, + "task_loss": 0.22206629812717438 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8960245847702026, + "epoch": 9.16, + "learning_rate": 4.653893115431577e-06, + "loss": 0.6959, + "step": 10839, + "task_loss": 0.5248209834098816 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.730666995048523, + "epoch": 9.16, + "learning_rate": 4.649196956889264e-06, + "loss": 0.5553, + "step": 10840, + "task_loss": 0.4806661605834961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.477531373500824, + "epoch": 9.16, + "learning_rate": 4.644500798346953e-06, + "loss": 0.573, + "step": 10841, + "task_loss": 0.6099678874015808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2639722228050232, + "epoch": 9.16, + "learning_rate": 4.63980463980464e-06, + "loss": 0.5589, + "step": 10842, + "task_loss": 0.35312169790267944 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.590694010257721, + "epoch": 9.17, + "learning_rate": 4.6351084812623274e-06, + "loss": 0.513, + "step": 10843, + "task_loss": 1.1163192987442017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5915704965591431, + "epoch": 9.17, + "learning_rate": 4.6304123227200155e-06, + "loss": 0.7493, + "step": 10844, + "task_loss": 0.5995631814002991 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.28608807921409607, + "epoch": 9.17, + "learning_rate": 4.625716164177703e-06, + "loss": 0.5645, + "step": 10845, + "task_loss": 0.155314639210701 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6432895660400391, + "epoch": 9.17, + "learning_rate": 4.621020005635391e-06, + "loss": 0.5592, + "step": 10846, + "task_loss": 0.6715465784072876 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5430681705474854, + "epoch": 9.17, + "learning_rate": 4.616323847093078e-06, + "loss": 0.4852, + "step": 10847, + "task_loss": 0.7355966567993164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4202154576778412, + "epoch": 9.17, + "learning_rate": 4.611627688550766e-06, + "loss": 0.3948, + "step": 10848, + "task_loss": 0.5021507143974304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6153039932250977, + "epoch": 9.17, + "learning_rate": 4.606931530008454e-06, + "loss": 0.5353, + "step": 10849, + "task_loss": 0.4856630861759186 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48080381751060486, + "epoch": 9.17, + "learning_rate": 4.602235371466141e-06, + "loss": 0.5159, + "step": 10850, + "task_loss": 0.945907473564148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47829657793045044, + "epoch": 9.17, + "learning_rate": 4.597539212923828e-06, + "loss": 0.4869, + "step": 10851, + "task_loss": 0.8005138635635376 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5382614731788635, + "epoch": 9.17, + "learning_rate": 4.592843054381516e-06, + "loss": 0.4734, + "step": 10852, + "task_loss": 0.7610917687416077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6903530359268188, + "epoch": 9.17, + "learning_rate": 4.5881468958392035e-06, + "loss": 0.7259, + "step": 10853, + "task_loss": 1.2119572162628174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9453556537628174, + "epoch": 9.17, + "learning_rate": 4.5834507372968916e-06, + "loss": 0.6248, + "step": 10854, + "task_loss": 1.0381531715393066 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5619428157806396, + "epoch": 9.18, + "learning_rate": 4.578754578754579e-06, + "loss": 0.6098, + "step": 10855, + "task_loss": 1.1913701295852661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7246257662773132, + "epoch": 9.18, + "learning_rate": 4.574058420212267e-06, + "loss": 0.5533, + "step": 10856, + "task_loss": 1.5235695838928223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4764540493488312, + "epoch": 9.18, + "learning_rate": 4.569362261669955e-06, + "loss": 0.5469, + "step": 10857, + "task_loss": 0.22802934050559998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.797978401184082, + "epoch": 9.18, + "learning_rate": 4.564666103127642e-06, + "loss": 0.6318, + "step": 10858, + "task_loss": 0.49362775683403015 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32643651962280273, + "epoch": 9.18, + "learning_rate": 4.559969944585329e-06, + "loss": 0.4707, + "step": 10859, + "task_loss": 0.46889522671699524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25688478350639343, + "epoch": 9.18, + "learning_rate": 4.555273786043017e-06, + "loss": 0.4528, + "step": 10860, + "task_loss": 0.4083934724330902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4214138090610504, + "epoch": 9.18, + "learning_rate": 4.550577627500704e-06, + "loss": 0.5602, + "step": 10861, + "task_loss": 0.40938860177993774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4213910698890686, + "epoch": 9.18, + "learning_rate": 4.545881468958392e-06, + "loss": 0.5055, + "step": 10862, + "task_loss": 0.16619780659675598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3822334408760071, + "epoch": 9.18, + "learning_rate": 4.54118531041608e-06, + "loss": 0.3667, + "step": 10863, + "task_loss": 0.43113839626312256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3260955810546875, + "epoch": 9.18, + "learning_rate": 4.536489151873768e-06, + "loss": 0.5029, + "step": 10864, + "task_loss": 1.480652093887329 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4976132810115814, + "epoch": 9.18, + "learning_rate": 4.531792993331455e-06, + "loss": 0.4908, + "step": 10865, + "task_loss": 0.5951100587844849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31109315156936646, + "epoch": 9.19, + "learning_rate": 4.527096834789143e-06, + "loss": 0.4729, + "step": 10866, + "task_loss": 1.3585652112960815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27515026926994324, + "epoch": 9.19, + "learning_rate": 4.52240067624683e-06, + "loss": 0.4654, + "step": 10867, + "task_loss": 0.510993242263794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5281930565834045, + "epoch": 9.19, + "learning_rate": 4.517704517704518e-06, + "loss": 0.5031, + "step": 10868, + "task_loss": 0.548530638217926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.679369330406189, + "epoch": 9.19, + "learning_rate": 4.513008359162205e-06, + "loss": 0.5637, + "step": 10869, + "task_loss": 0.22896431386470795 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6705942153930664, + "epoch": 9.19, + "learning_rate": 4.5083122006198925e-06, + "loss": 0.5419, + "step": 10870, + "task_loss": 0.7425059080123901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6122167706489563, + "epoch": 9.19, + "learning_rate": 4.503616042077581e-06, + "loss": 0.5378, + "step": 10871, + "task_loss": 1.1634297370910645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6949007511138916, + "epoch": 9.19, + "learning_rate": 4.4989198835352685e-06, + "loss": 0.7366, + "step": 10872, + "task_loss": 1.2006893157958984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4023021459579468, + "epoch": 9.19, + "learning_rate": 4.494223724992956e-06, + "loss": 0.5056, + "step": 10873, + "task_loss": 0.24474762380123138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5639364719390869, + "epoch": 9.19, + "learning_rate": 4.489527566450644e-06, + "loss": 0.6467, + "step": 10874, + "task_loss": 1.7069226503372192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3295495808124542, + "epoch": 9.19, + "learning_rate": 4.484831407908331e-06, + "loss": 0.5429, + "step": 10875, + "task_loss": 0.46266359090805054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3693598210811615, + "epoch": 9.19, + "learning_rate": 4.480135249366019e-06, + "loss": 0.6523, + "step": 10876, + "task_loss": 1.045897364616394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7294018268585205, + "epoch": 9.19, + "learning_rate": 4.475439090823706e-06, + "loss": 0.535, + "step": 10877, + "task_loss": 0.5471966862678528 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36468178033828735, + "epoch": 9.2, + "learning_rate": 4.470742932281393e-06, + "loss": 0.4697, + "step": 10878, + "task_loss": 0.08477520942687988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7204464673995972, + "epoch": 9.2, + "learning_rate": 4.466046773739082e-06, + "loss": 0.5768, + "step": 10879, + "task_loss": 1.8280061483383179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5907106399536133, + "epoch": 9.2, + "learning_rate": 4.4613506151967694e-06, + "loss": 0.6047, + "step": 10880, + "task_loss": 0.15427885949611664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7866789102554321, + "epoch": 9.2, + "learning_rate": 4.456654456654457e-06, + "loss": 0.5545, + "step": 10881, + "task_loss": 0.720936119556427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43583813309669495, + "epoch": 9.2, + "learning_rate": 4.451958298112145e-06, + "loss": 0.5237, + "step": 10882, + "task_loss": 0.6013363003730774 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48178181052207947, + "epoch": 9.2, + "learning_rate": 4.447262139569832e-06, + "loss": 0.6936, + "step": 10883, + "task_loss": 0.6246921420097351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8088822960853577, + "epoch": 9.2, + "learning_rate": 4.44256598102752e-06, + "loss": 0.6107, + "step": 10884, + "task_loss": 0.8761305212974548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4401460886001587, + "epoch": 9.2, + "learning_rate": 4.437869822485207e-06, + "loss": 0.5644, + "step": 10885, + "task_loss": 0.0860443264245987 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34712502360343933, + "epoch": 9.2, + "learning_rate": 4.433173663942894e-06, + "loss": 0.5432, + "step": 10886, + "task_loss": 0.4822004735469818 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8429248929023743, + "epoch": 9.2, + "learning_rate": 4.428477505400583e-06, + "loss": 0.5896, + "step": 10887, + "task_loss": 1.1455215215682983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47533881664276123, + "epoch": 9.2, + "learning_rate": 4.42378134685827e-06, + "loss": 0.4811, + "step": 10888, + "task_loss": 1.444555401802063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5070008635520935, + "epoch": 9.2, + "learning_rate": 4.4190851883159575e-06, + "loss": 0.6044, + "step": 10889, + "task_loss": 0.40574362874031067 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5617731809616089, + "epoch": 9.21, + "learning_rate": 4.4143890297736455e-06, + "loss": 0.5326, + "step": 10890, + "task_loss": 1.3129022121429443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5047394037246704, + "epoch": 9.21, + "learning_rate": 4.409692871231333e-06, + "loss": 0.5574, + "step": 10891, + "task_loss": 0.27070388197898865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.3662519454956055, + "epoch": 9.21, + "learning_rate": 4.404996712689021e-06, + "loss": 0.6919, + "step": 10892, + "task_loss": 0.7481173276901245 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6432501077651978, + "epoch": 9.21, + "learning_rate": 4.400300554146708e-06, + "loss": 0.6158, + "step": 10893, + "task_loss": 0.874040961265564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5450224876403809, + "epoch": 9.21, + "learning_rate": 4.395604395604396e-06, + "loss": 0.4155, + "step": 10894, + "task_loss": 0.7032738327980042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6036750078201294, + "epoch": 9.21, + "learning_rate": 4.390908237062084e-06, + "loss": 0.5721, + "step": 10895, + "task_loss": 0.4670071601867676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4691944420337677, + "epoch": 9.21, + "learning_rate": 4.386212078519771e-06, + "loss": 0.6053, + "step": 10896, + "task_loss": 0.9119598865509033 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4413752257823944, + "epoch": 9.21, + "learning_rate": 4.381515919977458e-06, + "loss": 0.4408, + "step": 10897, + "task_loss": 0.44342488050460815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42831915616989136, + "epoch": 9.21, + "learning_rate": 4.376819761435146e-06, + "loss": 0.4601, + "step": 10898, + "task_loss": 0.37882789969444275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5898022651672363, + "epoch": 9.21, + "learning_rate": 4.372123602892834e-06, + "loss": 0.5215, + "step": 10899, + "task_loss": 0.8129600286483765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5273644328117371, + "epoch": 9.21, + "learning_rate": 4.367427444350522e-06, + "loss": 0.5033, + "step": 10900, + "task_loss": 0.8919691443443298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5367013812065125, + "epoch": 9.21, + "learning_rate": 4.362731285808209e-06, + "loss": 0.5728, + "step": 10901, + "task_loss": 0.8962957262992859 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4875826835632324, + "epoch": 9.22, + "learning_rate": 4.358035127265897e-06, + "loss": 0.4642, + "step": 10902, + "task_loss": 0.5162282586097717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4924187958240509, + "epoch": 9.22, + "learning_rate": 4.353338968723585e-06, + "loss": 0.6292, + "step": 10903, + "task_loss": 0.3756810128688812 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4626406133174896, + "epoch": 9.22, + "learning_rate": 4.348642810181272e-06, + "loss": 0.5774, + "step": 10904, + "task_loss": 0.5998739004135132 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4531788229942322, + "epoch": 9.22, + "learning_rate": 4.343946651638959e-06, + "loss": 0.5687, + "step": 10905, + "task_loss": 0.6416094303131104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6396664977073669, + "epoch": 9.22, + "learning_rate": 4.339250493096647e-06, + "loss": 0.5488, + "step": 10906, + "task_loss": 0.9034744501113892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5526031851768494, + "epoch": 9.22, + "learning_rate": 4.3345543345543345e-06, + "loss": 0.6191, + "step": 10907, + "task_loss": 0.7673710584640503 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5250399112701416, + "epoch": 9.22, + "learning_rate": 4.3298581760120225e-06, + "loss": 0.4553, + "step": 10908, + "task_loss": 0.49567198753356934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6319273114204407, + "epoch": 9.22, + "learning_rate": 4.32516201746971e-06, + "loss": 0.5483, + "step": 10909, + "task_loss": 0.9601724147796631 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5628266334533691, + "epoch": 9.22, + "learning_rate": 4.320465858927398e-06, + "loss": 0.5726, + "step": 10910, + "task_loss": 0.9729996919631958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5117441415786743, + "epoch": 9.22, + "learning_rate": 4.315769700385086e-06, + "loss": 0.5154, + "step": 10911, + "task_loss": 1.0214203596115112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39926576614379883, + "epoch": 9.22, + "learning_rate": 4.311073541842773e-06, + "loss": 0.5379, + "step": 10912, + "task_loss": 0.44812148809432983 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5475550293922424, + "epoch": 9.22, + "learning_rate": 4.30637738330046e-06, + "loss": 0.6162, + "step": 10913, + "task_loss": 0.7654815316200256 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46837982535362244, + "epoch": 9.23, + "learning_rate": 4.301681224758148e-06, + "loss": 0.4933, + "step": 10914, + "task_loss": 1.0316585302352905 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2505451440811157, + "epoch": 9.23, + "learning_rate": 4.296985066215835e-06, + "loss": 0.3806, + "step": 10915, + "task_loss": 0.6235789656639099 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7343634366989136, + "epoch": 9.23, + "learning_rate": 4.292288907673523e-06, + "loss": 0.5451, + "step": 10916, + "task_loss": 1.1542083024978638 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6989647150039673, + "epoch": 9.23, + "learning_rate": 4.287592749131211e-06, + "loss": 0.6421, + "step": 10917, + "task_loss": 1.1470345258712769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41732776165008545, + "epoch": 9.23, + "learning_rate": 4.282896590588899e-06, + "loss": 0.6963, + "step": 10918, + "task_loss": 0.34159597754478455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4899551272392273, + "epoch": 9.23, + "learning_rate": 4.278200432046587e-06, + "loss": 0.5992, + "step": 10919, + "task_loss": 0.8436910510063171 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5855897665023804, + "epoch": 9.23, + "learning_rate": 4.273504273504274e-06, + "loss": 0.463, + "step": 10920, + "task_loss": 0.6768660545349121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6877353191375732, + "epoch": 9.23, + "learning_rate": 4.268808114961961e-06, + "loss": 0.5718, + "step": 10921, + "task_loss": 0.9600094556808472 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3731490969657898, + "epoch": 9.23, + "learning_rate": 4.264111956419649e-06, + "loss": 0.5394, + "step": 10922, + "task_loss": 0.47793394327163696 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9520018100738525, + "epoch": 9.23, + "learning_rate": 4.259415797877336e-06, + "loss": 0.745, + "step": 10923, + "task_loss": 1.4026192426681519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26013728976249695, + "epoch": 9.23, + "learning_rate": 4.254719639335024e-06, + "loss": 0.4093, + "step": 10924, + "task_loss": 0.09092708677053452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6342916488647461, + "epoch": 9.23, + "learning_rate": 4.250023480792712e-06, + "loss": 0.4316, + "step": 10925, + "task_loss": 0.8332868218421936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.522832989692688, + "epoch": 9.24, + "learning_rate": 4.2453273222503995e-06, + "loss": 0.4659, + "step": 10926, + "task_loss": 0.5855490565299988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43368786573410034, + "epoch": 9.24, + "learning_rate": 4.2406311637080875e-06, + "loss": 0.3576, + "step": 10927, + "task_loss": 0.07296571880578995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4034872055053711, + "epoch": 9.24, + "learning_rate": 4.235935005165775e-06, + "loss": 0.4917, + "step": 10928, + "task_loss": 0.5275158286094666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30411869287490845, + "epoch": 9.24, + "learning_rate": 4.231238846623462e-06, + "loss": 0.3645, + "step": 10929, + "task_loss": 0.38052770495414734 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3986460864543915, + "epoch": 9.24, + "learning_rate": 4.22654268808115e-06, + "loss": 0.5812, + "step": 10930, + "task_loss": 0.5870955586433411 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5513052940368652, + "epoch": 9.24, + "learning_rate": 4.221846529538837e-06, + "loss": 0.5743, + "step": 10931, + "task_loss": 1.5133676528930664 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7492802739143372, + "epoch": 9.24, + "learning_rate": 4.217150370996525e-06, + "loss": 0.5667, + "step": 10932, + "task_loss": 1.339775800704956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.568235456943512, + "epoch": 9.24, + "learning_rate": 4.212454212454213e-06, + "loss": 0.4528, + "step": 10933, + "task_loss": 0.6803421378135681 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3524261713027954, + "epoch": 9.24, + "learning_rate": 4.2077580539119e-06, + "loss": 0.3354, + "step": 10934, + "task_loss": 0.4697146713733673 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4600464999675751, + "epoch": 9.24, + "learning_rate": 4.203061895369588e-06, + "loss": 0.4703, + "step": 10935, + "task_loss": 0.592668890953064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9647623896598816, + "epoch": 9.24, + "learning_rate": 4.198365736827276e-06, + "loss": 0.623, + "step": 10936, + "task_loss": 0.6262195110321045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8514769673347473, + "epoch": 9.24, + "learning_rate": 4.193669578284963e-06, + "loss": 0.6031, + "step": 10937, + "task_loss": 0.7583869099617004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.622306764125824, + "epoch": 9.25, + "learning_rate": 4.188973419742651e-06, + "loss": 0.6106, + "step": 10938, + "task_loss": 0.41055312752723694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7921923398971558, + "epoch": 9.25, + "learning_rate": 4.184277261200338e-06, + "loss": 0.7328, + "step": 10939, + "task_loss": 1.082035779953003 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7467060089111328, + "epoch": 9.25, + "learning_rate": 4.179581102658025e-06, + "loss": 0.5168, + "step": 10940, + "task_loss": 0.24246633052825928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5491392016410828, + "epoch": 9.25, + "learning_rate": 4.174884944115714e-06, + "loss": 0.558, + "step": 10941, + "task_loss": 1.1125648021697998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4908609688282013, + "epoch": 9.25, + "learning_rate": 4.170188785573401e-06, + "loss": 0.4992, + "step": 10942, + "task_loss": 0.6146554350852966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4285265803337097, + "epoch": 9.25, + "learning_rate": 4.1654926270310885e-06, + "loss": 0.5121, + "step": 10943, + "task_loss": 0.8556405305862427 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49097099900245667, + "epoch": 9.25, + "learning_rate": 4.1607964684887765e-06, + "loss": 0.4452, + "step": 10944, + "task_loss": 0.48642709851264954 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7617306709289551, + "epoch": 9.25, + "learning_rate": 4.156100309946464e-06, + "loss": 0.543, + "step": 10945, + "task_loss": 1.170119285583496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8503702878952026, + "epoch": 9.25, + "learning_rate": 4.151404151404152e-06, + "loss": 0.6929, + "step": 10946, + "task_loss": 1.1385785341262817 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5523126125335693, + "epoch": 9.25, + "learning_rate": 4.146707992861839e-06, + "loss": 0.5942, + "step": 10947, + "task_loss": 0.7568174004554749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8795843720436096, + "epoch": 9.25, + "learning_rate": 4.142011834319527e-06, + "loss": 0.6733, + "step": 10948, + "task_loss": 0.966051459312439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7091909646987915, + "epoch": 9.26, + "learning_rate": 4.137315675777215e-06, + "loss": 0.6119, + "step": 10949, + "task_loss": 1.6939926147460938 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2957199215888977, + "epoch": 9.26, + "learning_rate": 4.132619517234902e-06, + "loss": 0.5533, + "step": 10950, + "task_loss": 0.9509096741676331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37209826707839966, + "epoch": 9.26, + "learning_rate": 4.127923358692589e-06, + "loss": 0.6491, + "step": 10951, + "task_loss": 0.7759472131729126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3615013659000397, + "epoch": 9.26, + "learning_rate": 4.123227200150277e-06, + "loss": 0.4066, + "step": 10952, + "task_loss": 0.8642329573631287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6423311829566956, + "epoch": 9.26, + "learning_rate": 4.1185310416079646e-06, + "loss": 0.7173, + "step": 10953, + "task_loss": 0.858659565448761 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6531955003738403, + "epoch": 9.26, + "learning_rate": 4.113834883065653e-06, + "loss": 0.5675, + "step": 10954, + "task_loss": 0.7577497363090515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3250599503517151, + "epoch": 9.26, + "learning_rate": 4.10913872452334e-06, + "loss": 0.4185, + "step": 10955, + "task_loss": 0.8010559678077698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45157450437545776, + "epoch": 9.26, + "learning_rate": 4.104442565981028e-06, + "loss": 0.4836, + "step": 10956, + "task_loss": 1.5399510860443115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5517041683197021, + "epoch": 9.26, + "learning_rate": 4.099746407438716e-06, + "loss": 0.6736, + "step": 10957, + "task_loss": 0.7331752777099609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38505929708480835, + "epoch": 9.26, + "learning_rate": 4.095050248896403e-06, + "loss": 0.586, + "step": 10958, + "task_loss": 0.2563443183898926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8032345175743103, + "epoch": 9.26, + "learning_rate": 4.09035409035409e-06, + "loss": 0.6244, + "step": 10959, + "task_loss": 0.26627886295318604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26758769154548645, + "epoch": 9.26, + "learning_rate": 4.085657931811778e-06, + "loss": 0.4273, + "step": 10960, + "task_loss": 0.10821530967950821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3220205307006836, + "epoch": 9.27, + "learning_rate": 4.0809617732694654e-06, + "loss": 0.4937, + "step": 10961, + "task_loss": 1.168175220489502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41894710063934326, + "epoch": 9.27, + "learning_rate": 4.0762656147271535e-06, + "loss": 0.4654, + "step": 10962, + "task_loss": 0.25754567980766296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5783056020736694, + "epoch": 9.27, + "learning_rate": 4.071569456184841e-06, + "loss": 0.4733, + "step": 10963, + "task_loss": 0.5873746871948242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5809851288795471, + "epoch": 9.27, + "learning_rate": 4.066873297642529e-06, + "loss": 0.4767, + "step": 10964, + "task_loss": 0.693004310131073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33629119396209717, + "epoch": 9.27, + "learning_rate": 4.062177139100217e-06, + "loss": 0.4741, + "step": 10965, + "task_loss": 0.526307225227356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5074222683906555, + "epoch": 9.27, + "learning_rate": 4.057480980557904e-06, + "loss": 0.5677, + "step": 10966, + "task_loss": 1.1005892753601074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7590937614440918, + "epoch": 9.27, + "learning_rate": 4.052784822015591e-06, + "loss": 0.5201, + "step": 10967, + "task_loss": 0.48172539472579956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6252413392066956, + "epoch": 9.27, + "learning_rate": 4.048088663473279e-06, + "loss": 0.6395, + "step": 10968, + "task_loss": 1.1368328332901 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5254808068275452, + "epoch": 9.27, + "learning_rate": 4.043392504930966e-06, + "loss": 0.4691, + "step": 10969, + "task_loss": 0.893790066242218 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4146735668182373, + "epoch": 9.27, + "learning_rate": 4.038696346388654e-06, + "loss": 0.456, + "step": 10970, + "task_loss": 0.5177074670791626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41818273067474365, + "epoch": 9.27, + "learning_rate": 4.034000187846342e-06, + "loss": 0.4448, + "step": 10971, + "task_loss": 1.249911904335022 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3412929177284241, + "epoch": 9.27, + "learning_rate": 4.0293040293040296e-06, + "loss": 0.4102, + "step": 10972, + "task_loss": 0.1637885868549347 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3158021867275238, + "epoch": 9.28, + "learning_rate": 4.024607870761718e-06, + "loss": 0.4618, + "step": 10973, + "task_loss": 0.2861343026161194 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4541577100753784, + "epoch": 9.28, + "learning_rate": 4.019911712219405e-06, + "loss": 0.5382, + "step": 10974, + "task_loss": 0.2660941183567047 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6278258562088013, + "epoch": 9.28, + "learning_rate": 4.015215553677092e-06, + "loss": 0.5823, + "step": 10975, + "task_loss": 1.1680188179016113 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5896298289299011, + "epoch": 9.28, + "learning_rate": 4.01051939513478e-06, + "loss": 0.5628, + "step": 10976, + "task_loss": 0.45311036705970764 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3496105372905731, + "epoch": 9.28, + "learning_rate": 4.005823236592467e-06, + "loss": 0.4949, + "step": 10977, + "task_loss": 0.44767728447914124 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4825911819934845, + "epoch": 9.28, + "learning_rate": 4.001127078050155e-06, + "loss": 0.6344, + "step": 10978, + "task_loss": 0.7057150602340698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.753537654876709, + "epoch": 9.28, + "learning_rate": 3.996430919507843e-06, + "loss": 0.5862, + "step": 10979, + "task_loss": 0.869903028011322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5526801943778992, + "epoch": 9.28, + "learning_rate": 3.9917347609655305e-06, + "loss": 0.4746, + "step": 10980, + "task_loss": 0.7394396066665649 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9483023881912231, + "epoch": 9.28, + "learning_rate": 3.9870386024232185e-06, + "loss": 0.5849, + "step": 10981, + "task_loss": 2.007902145385742 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4774699807167053, + "epoch": 9.28, + "learning_rate": 3.982342443880906e-06, + "loss": 0.5065, + "step": 10982, + "task_loss": 0.17856507003307343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8343600630760193, + "epoch": 9.28, + "learning_rate": 3.977646285338593e-06, + "loss": 0.6481, + "step": 10983, + "task_loss": 1.2437623739242554 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42253315448760986, + "epoch": 9.28, + "learning_rate": 3.972950126796281e-06, + "loss": 0.4204, + "step": 10984, + "task_loss": 0.8215905427932739 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3095882833003998, + "epoch": 9.29, + "learning_rate": 3.968253968253968e-06, + "loss": 0.5732, + "step": 10985, + "task_loss": 0.13707387447357178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6788414716720581, + "epoch": 9.29, + "learning_rate": 3.963557809711656e-06, + "loss": 0.5556, + "step": 10986, + "task_loss": 0.6071580648422241 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.629853367805481, + "epoch": 9.29, + "learning_rate": 3.958861651169344e-06, + "loss": 0.6, + "step": 10987, + "task_loss": 0.7008403539657593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3458211421966553, + "epoch": 9.29, + "learning_rate": 3.954165492627031e-06, + "loss": 0.4421, + "step": 10988, + "task_loss": 0.810414731502533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4081161618232727, + "epoch": 9.29, + "learning_rate": 3.949469334084719e-06, + "loss": 0.5676, + "step": 10989, + "task_loss": 0.07090999186038971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5614745020866394, + "epoch": 9.29, + "learning_rate": 3.9447731755424066e-06, + "loss": 0.6604, + "step": 10990, + "task_loss": 0.7482591867446899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5996106863021851, + "epoch": 9.29, + "learning_rate": 3.940077017000094e-06, + "loss": 0.5584, + "step": 10991, + "task_loss": 1.13728928565979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3209647834300995, + "epoch": 9.29, + "learning_rate": 3.935380858457782e-06, + "loss": 0.552, + "step": 10992, + "task_loss": 0.30690452456474304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8693583011627197, + "epoch": 9.29, + "learning_rate": 3.930684699915469e-06, + "loss": 0.5777, + "step": 10993, + "task_loss": 1.0151220560073853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.327233225107193, + "epoch": 9.29, + "learning_rate": 3.925988541373157e-06, + "loss": 0.3927, + "step": 10994, + "task_loss": 0.67607182264328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9752513766288757, + "epoch": 9.29, + "learning_rate": 3.921292382830845e-06, + "loss": 0.508, + "step": 10995, + "task_loss": 0.8663730621337891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3999868929386139, + "epoch": 9.29, + "learning_rate": 3.916596224288532e-06, + "loss": 0.4652, + "step": 10996, + "task_loss": 0.5222601294517517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5390021800994873, + "epoch": 9.3, + "learning_rate": 3.91190006574622e-06, + "loss": 0.6144, + "step": 10997, + "task_loss": 0.6221882104873657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7796196937561035, + "epoch": 9.3, + "learning_rate": 3.9072039072039074e-06, + "loss": 0.5226, + "step": 10998, + "task_loss": 1.2386220693588257 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5757112503051758, + "epoch": 9.3, + "learning_rate": 3.902507748661595e-06, + "loss": 0.4222, + "step": 10999, + "task_loss": 0.43675899505615234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5463241338729858, + "epoch": 9.3, + "learning_rate": 3.897811590119283e-06, + "loss": 0.5485, + "step": 11000, + "task_loss": 0.4274149239063263 + }, + { + "epoch": 9.3, + "eval_accuracy": 0.9038811881188119, + "eval_loss": 0.35764050483703613, + "eval_runtime": 225.2382, + "eval_samples_per_second": 112.104, + "eval_steps_per_second": 0.879, + "step": 11000 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7140568494796753, + "epoch": 9.3, + "learning_rate": 3.89311543157697e-06, + "loss": 0.7323, + "step": 11001, + "task_loss": 0.5698778629302979 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43075352907180786, + "epoch": 9.3, + "learning_rate": 3.888419273034658e-06, + "loss": 0.6062, + "step": 11002, + "task_loss": 1.105376124382019 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5407055616378784, + "epoch": 9.3, + "learning_rate": 3.883723114492346e-06, + "loss": 0.5877, + "step": 11003, + "task_loss": 1.01487398147583 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7124694585800171, + "epoch": 9.3, + "learning_rate": 3.879026955950033e-06, + "loss": 0.4999, + "step": 11004, + "task_loss": 1.0324440002441406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6259205937385559, + "epoch": 9.3, + "learning_rate": 3.874330797407721e-06, + "loss": 0.587, + "step": 11005, + "task_loss": 0.7234367728233337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6852434873580933, + "epoch": 9.3, + "learning_rate": 3.869634638865408e-06, + "loss": 0.6335, + "step": 11006, + "task_loss": 0.2126229703426361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1977427005767822, + "epoch": 9.3, + "learning_rate": 3.8649384803230955e-06, + "loss": 0.8879, + "step": 11007, + "task_loss": 1.018966794013977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4569547772407532, + "epoch": 9.3, + "learning_rate": 3.8602423217807835e-06, + "loss": 0.3863, + "step": 11008, + "task_loss": 0.8507957458496094 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2581004798412323, + "epoch": 9.31, + "learning_rate": 3.855546163238471e-06, + "loss": 0.4681, + "step": 11009, + "task_loss": 0.3404015600681305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4858361482620239, + "epoch": 9.31, + "learning_rate": 3.850850004696159e-06, + "loss": 0.4896, + "step": 11010, + "task_loss": 0.5981464982032776 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35521265864372253, + "epoch": 9.31, + "learning_rate": 3.846153846153847e-06, + "loss": 0.5955, + "step": 11011, + "task_loss": 0.2629264295101166 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8452341556549072, + "epoch": 9.31, + "learning_rate": 3.841457687611534e-06, + "loss": 0.7454, + "step": 11012, + "task_loss": 0.5197286009788513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3735959231853485, + "epoch": 9.31, + "learning_rate": 3.836761529069222e-06, + "loss": 0.4526, + "step": 11013, + "task_loss": 0.12611602246761322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6044589281082153, + "epoch": 9.31, + "learning_rate": 3.832065370526909e-06, + "loss": 0.5917, + "step": 11014, + "task_loss": 0.38324275612831116 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5282130241394043, + "epoch": 9.31, + "learning_rate": 3.827369211984596e-06, + "loss": 0.4926, + "step": 11015, + "task_loss": 0.7896764278411865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.572511613368988, + "epoch": 9.31, + "learning_rate": 3.822673053442284e-06, + "loss": 0.5863, + "step": 11016, + "task_loss": 0.8338715434074402 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5187878608703613, + "epoch": 9.31, + "learning_rate": 3.817976894899972e-06, + "loss": 0.5099, + "step": 11017, + "task_loss": 0.4736097455024719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48359987139701843, + "epoch": 9.31, + "learning_rate": 3.81328073635766e-06, + "loss": 0.5478, + "step": 11018, + "task_loss": 1.1570225954055786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3049069344997406, + "epoch": 9.31, + "learning_rate": 3.8085845778153473e-06, + "loss": 0.5006, + "step": 11019, + "task_loss": 0.4704192280769348 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5359009504318237, + "epoch": 9.32, + "learning_rate": 3.803888419273035e-06, + "loss": 0.6086, + "step": 11020, + "task_loss": 0.6723226308822632 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6238926649093628, + "epoch": 9.32, + "learning_rate": 3.7991922607307225e-06, + "loss": 0.5637, + "step": 11021, + "task_loss": 0.486113041639328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4815084934234619, + "epoch": 9.32, + "learning_rate": 3.79449610218841e-06, + "loss": 0.5461, + "step": 11022, + "task_loss": 0.7111560106277466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9258511662483215, + "epoch": 9.32, + "learning_rate": 3.7897999436460977e-06, + "loss": 0.5372, + "step": 11023, + "task_loss": 0.5527610182762146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9421027898788452, + "epoch": 9.32, + "learning_rate": 3.785103785103785e-06, + "loss": 0.6362, + "step": 11024, + "task_loss": 0.9712353944778442 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9945833683013916, + "epoch": 9.32, + "learning_rate": 3.7804076265614733e-06, + "loss": 0.5268, + "step": 11025, + "task_loss": 1.420912504196167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3107377886772156, + "epoch": 9.32, + "learning_rate": 3.775711468019161e-06, + "loss": 0.5698, + "step": 11026, + "task_loss": 0.7533959746360779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6495245695114136, + "epoch": 9.32, + "learning_rate": 3.771015309476848e-06, + "loss": 0.4107, + "step": 11027, + "task_loss": 1.245865821838379 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9657779932022095, + "epoch": 9.32, + "learning_rate": 3.7663191509345357e-06, + "loss": 0.635, + "step": 11028, + "task_loss": 1.0642495155334473 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4579669237136841, + "epoch": 9.32, + "learning_rate": 3.7616229923922234e-06, + "loss": 0.4954, + "step": 11029, + "task_loss": 1.1511746644973755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6782408952713013, + "epoch": 9.32, + "learning_rate": 3.756926833849911e-06, + "loss": 0.5067, + "step": 11030, + "task_loss": 0.6037449836730957 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.902891218662262, + "epoch": 9.32, + "learning_rate": 3.7522306753075986e-06, + "loss": 0.5962, + "step": 11031, + "task_loss": 1.0377341508865356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6460734605789185, + "epoch": 9.33, + "learning_rate": 3.7475345167652858e-06, + "loss": 0.5144, + "step": 11032, + "task_loss": 0.9097421765327454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46765586733818054, + "epoch": 9.33, + "learning_rate": 3.7428383582229742e-06, + "loss": 0.5772, + "step": 11033, + "task_loss": 0.4408780038356781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6825538873672485, + "epoch": 9.33, + "learning_rate": 3.738142199680662e-06, + "loss": 0.6653, + "step": 11034, + "task_loss": 0.528090238571167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.572077214717865, + "epoch": 9.33, + "learning_rate": 3.733446041138349e-06, + "loss": 0.5151, + "step": 11035, + "task_loss": 1.003177523612976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.78909832239151, + "epoch": 9.33, + "learning_rate": 3.7287498825960366e-06, + "loss": 0.6346, + "step": 11036, + "task_loss": 0.9628617763519287 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27340149879455566, + "epoch": 9.33, + "learning_rate": 3.7240537240537242e-06, + "loss": 0.4802, + "step": 11037, + "task_loss": 0.42176955938339233 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6290633082389832, + "epoch": 9.33, + "learning_rate": 3.719357565511412e-06, + "loss": 0.5366, + "step": 11038, + "task_loss": 1.171726942062378 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6020877361297607, + "epoch": 9.33, + "learning_rate": 3.714661406969099e-06, + "loss": 0.569, + "step": 11039, + "task_loss": 0.8706861138343811 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.603912353515625, + "epoch": 9.33, + "learning_rate": 3.7099652484267866e-06, + "loss": 0.6198, + "step": 11040, + "task_loss": 0.7017185688018799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30134326219558716, + "epoch": 9.33, + "learning_rate": 3.705269089884475e-06, + "loss": 0.5152, + "step": 11041, + "task_loss": 0.7170421481132507 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5750234723091125, + "epoch": 9.33, + "learning_rate": 3.7005729313421623e-06, + "loss": 0.5655, + "step": 11042, + "task_loss": 0.5489616990089417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.70073002576828, + "epoch": 9.33, + "learning_rate": 3.69587677279985e-06, + "loss": 0.6977, + "step": 11043, + "task_loss": 1.563845157623291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4483509659767151, + "epoch": 9.34, + "learning_rate": 3.6911806142575375e-06, + "loss": 0.6021, + "step": 11044, + "task_loss": 0.8907885551452637 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5995936989784241, + "epoch": 9.34, + "learning_rate": 3.686484455715225e-06, + "loss": 0.5584, + "step": 11045, + "task_loss": 1.0019899606704712 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6675196290016174, + "epoch": 9.34, + "learning_rate": 3.6817882971729127e-06, + "loss": 0.5926, + "step": 11046, + "task_loss": 0.529183566570282 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6787434816360474, + "epoch": 9.34, + "learning_rate": 3.6770921386306e-06, + "loss": 0.6068, + "step": 11047, + "task_loss": 0.3641342520713806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3751561641693115, + "epoch": 9.34, + "learning_rate": 3.6723959800882875e-06, + "loss": 0.5904, + "step": 11048, + "task_loss": 0.34313997626304626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.500418484210968, + "epoch": 9.34, + "learning_rate": 3.667699821545976e-06, + "loss": 0.5748, + "step": 11049, + "task_loss": 0.19498911499977112 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34913453459739685, + "epoch": 9.34, + "learning_rate": 3.663003663003663e-06, + "loss": 0.4181, + "step": 11050, + "task_loss": 0.5265745520591736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7342382073402405, + "epoch": 9.34, + "learning_rate": 3.6583075044613508e-06, + "loss": 0.5721, + "step": 11051, + "task_loss": 0.22351816296577454 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39087793231010437, + "epoch": 9.34, + "learning_rate": 3.6536113459190384e-06, + "loss": 0.5917, + "step": 11052, + "task_loss": 0.6375265121459961 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.644103467464447, + "epoch": 9.34, + "learning_rate": 3.648915187376726e-06, + "loss": 0.4524, + "step": 11053, + "task_loss": 0.6648716330528259 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5738146901130676, + "epoch": 9.34, + "learning_rate": 3.6442190288344136e-06, + "loss": 0.5266, + "step": 11054, + "task_loss": 0.42964690923690796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6777070164680481, + "epoch": 9.34, + "learning_rate": 3.639522870292101e-06, + "loss": 0.5765, + "step": 11055, + "task_loss": 0.7385520935058594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5895520448684692, + "epoch": 9.35, + "learning_rate": 3.6348267117497893e-06, + "loss": 0.6798, + "step": 11056, + "task_loss": 1.3323516845703125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5305094718933105, + "epoch": 9.35, + "learning_rate": 3.630130553207477e-06, + "loss": 0.5703, + "step": 11057, + "task_loss": 0.7368490099906921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36000561714172363, + "epoch": 9.35, + "learning_rate": 3.625434394665164e-06, + "loss": 0.4451, + "step": 11058, + "task_loss": 0.3638128638267517 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3327975869178772, + "epoch": 9.35, + "learning_rate": 3.6207382361228517e-06, + "loss": 0.4397, + "step": 11059, + "task_loss": 1.5285719633102417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3917226791381836, + "epoch": 9.35, + "learning_rate": 3.6160420775805393e-06, + "loss": 0.7669, + "step": 11060, + "task_loss": 0.44794198870658875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.641332745552063, + "epoch": 9.35, + "learning_rate": 3.611345919038227e-06, + "loss": 0.5377, + "step": 11061, + "task_loss": 0.5768241882324219 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39325499534606934, + "epoch": 9.35, + "learning_rate": 3.6066497604959145e-06, + "loss": 0.4892, + "step": 11062, + "task_loss": 0.3297574818134308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6134918928146362, + "epoch": 9.35, + "learning_rate": 3.6019536019536017e-06, + "loss": 0.5342, + "step": 11063, + "task_loss": 1.0496389865875244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5329225063323975, + "epoch": 9.35, + "learning_rate": 3.59725744341129e-06, + "loss": 0.4608, + "step": 11064, + "task_loss": 0.7007267475128174 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3505586087703705, + "epoch": 9.35, + "learning_rate": 3.5925612848689777e-06, + "loss": 0.5724, + "step": 11065, + "task_loss": 0.1864093393087387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7310865521430969, + "epoch": 9.35, + "learning_rate": 3.587865126326665e-06, + "loss": 0.598, + "step": 11066, + "task_loss": 0.6960495710372925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7150852084159851, + "epoch": 9.35, + "learning_rate": 3.5831689677843525e-06, + "loss": 0.4768, + "step": 11067, + "task_loss": 0.952617347240448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.762128472328186, + "epoch": 9.36, + "learning_rate": 3.57847280924204e-06, + "loss": 0.6121, + "step": 11068, + "task_loss": 1.0737755298614502 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41830921173095703, + "epoch": 9.36, + "learning_rate": 3.5737766506997278e-06, + "loss": 0.4232, + "step": 11069, + "task_loss": 0.4652903974056244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3023297190666199, + "epoch": 9.36, + "learning_rate": 3.569080492157415e-06, + "loss": 0.3323, + "step": 11070, + "task_loss": 0.4165792167186737 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.650453507900238, + "epoch": 9.36, + "learning_rate": 3.5643843336151026e-06, + "loss": 0.5822, + "step": 11071, + "task_loss": 0.8610770106315613 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6708232164382935, + "epoch": 9.36, + "learning_rate": 3.559688175072791e-06, + "loss": 0.5388, + "step": 11072, + "task_loss": 0.8451786637306213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8068320155143738, + "epoch": 9.36, + "learning_rate": 3.5549920165304786e-06, + "loss": 0.6151, + "step": 11073, + "task_loss": 0.9385790824890137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4990968108177185, + "epoch": 9.36, + "learning_rate": 3.550295857988166e-06, + "loss": 0.6511, + "step": 11074, + "task_loss": 0.3814873695373535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47068220376968384, + "epoch": 9.36, + "learning_rate": 3.5455996994458534e-06, + "loss": 0.4634, + "step": 11075, + "task_loss": 0.33107253909111023 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6775467991828918, + "epoch": 9.36, + "learning_rate": 3.540903540903541e-06, + "loss": 0.5738, + "step": 11076, + "task_loss": 0.8753811717033386 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.422080934047699, + "epoch": 9.36, + "learning_rate": 3.5362073823612286e-06, + "loss": 0.437, + "step": 11077, + "task_loss": 0.2625013291835785 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0858197212219238, + "epoch": 9.36, + "learning_rate": 3.531511223818916e-06, + "loss": 0.5868, + "step": 11078, + "task_loss": 0.7094920873641968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25350257754325867, + "epoch": 9.36, + "learning_rate": 3.5268150652766043e-06, + "loss": 0.4084, + "step": 11079, + "task_loss": 0.1762065589427948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7187150716781616, + "epoch": 9.37, + "learning_rate": 3.522118906734292e-06, + "loss": 0.6457, + "step": 11080, + "task_loss": 1.1745599508285522 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3787701725959778, + "epoch": 9.37, + "learning_rate": 3.517422748191979e-06, + "loss": 0.4752, + "step": 11081, + "task_loss": 0.367812842130661 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6437379717826843, + "epoch": 9.37, + "learning_rate": 3.5127265896496667e-06, + "loss": 0.7166, + "step": 11082, + "task_loss": 1.4012805223464966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48033130168914795, + "epoch": 9.37, + "learning_rate": 3.5080304311073543e-06, + "loss": 0.5487, + "step": 11083, + "task_loss": 0.5171042084693909 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7722226977348328, + "epoch": 9.37, + "learning_rate": 3.503334272565042e-06, + "loss": 0.7103, + "step": 11084, + "task_loss": 1.2335305213928223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4388309717178345, + "epoch": 9.37, + "learning_rate": 3.4986381140227295e-06, + "loss": 0.4219, + "step": 11085, + "task_loss": 0.924210786819458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33908790349960327, + "epoch": 9.37, + "learning_rate": 3.4939419554804167e-06, + "loss": 0.6567, + "step": 11086, + "task_loss": 0.4795485734939575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29407113790512085, + "epoch": 9.37, + "learning_rate": 3.489245796938105e-06, + "loss": 0.514, + "step": 11087, + "task_loss": 0.7161890864372253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5133517980575562, + "epoch": 9.37, + "learning_rate": 3.4845496383957928e-06, + "loss": 0.5356, + "step": 11088, + "task_loss": 1.0491986274719238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3746059834957123, + "epoch": 9.37, + "learning_rate": 3.47985347985348e-06, + "loss": 0.6627, + "step": 11089, + "task_loss": 0.8216314315795898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45719343423843384, + "epoch": 9.37, + "learning_rate": 3.4751573213111676e-06, + "loss": 0.5076, + "step": 11090, + "task_loss": 0.6946412920951843 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4834900498390198, + "epoch": 9.38, + "learning_rate": 3.470461162768855e-06, + "loss": 0.601, + "step": 11091, + "task_loss": 1.1683692932128906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41766154766082764, + "epoch": 9.38, + "learning_rate": 3.465765004226543e-06, + "loss": 0.5492, + "step": 11092, + "task_loss": 0.4272165894508362 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5390196442604065, + "epoch": 9.38, + "learning_rate": 3.4610688456842304e-06, + "loss": 0.461, + "step": 11093, + "task_loss": 0.586580753326416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5226321220397949, + "epoch": 9.38, + "learning_rate": 3.4563726871419176e-06, + "loss": 0.4098, + "step": 11094, + "task_loss": 0.29370424151420593 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3643988370895386, + "epoch": 9.38, + "learning_rate": 3.451676528599606e-06, + "loss": 0.5589, + "step": 11095, + "task_loss": 1.444654941558838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3300498425960541, + "epoch": 9.38, + "learning_rate": 3.4469803700572937e-06, + "loss": 0.4094, + "step": 11096, + "task_loss": 0.37557774782180786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7911975979804993, + "epoch": 9.38, + "learning_rate": 3.442284211514981e-06, + "loss": 0.4712, + "step": 11097, + "task_loss": 0.48998376727104187 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6135490536689758, + "epoch": 9.38, + "learning_rate": 3.4375880529726685e-06, + "loss": 0.602, + "step": 11098, + "task_loss": 0.38733822107315063 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32408007979393005, + "epoch": 9.38, + "learning_rate": 3.432891894430356e-06, + "loss": 0.4412, + "step": 11099, + "task_loss": 0.664608359336853 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29378288984298706, + "epoch": 9.38, + "learning_rate": 3.4281957358880437e-06, + "loss": 0.5081, + "step": 11100, + "task_loss": 0.09347495436668396 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4468725323677063, + "epoch": 9.38, + "learning_rate": 3.4234995773457313e-06, + "loss": 0.4573, + "step": 11101, + "task_loss": 1.1402078866958618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8350475430488586, + "epoch": 9.38, + "learning_rate": 3.4188034188034193e-06, + "loss": 0.5234, + "step": 11102, + "task_loss": 0.6529058814048767 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6792205572128296, + "epoch": 9.39, + "learning_rate": 3.414107260261107e-06, + "loss": 0.6285, + "step": 11103, + "task_loss": 0.9224597215652466 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8460720777511597, + "epoch": 9.39, + "learning_rate": 3.4094111017187945e-06, + "loss": 0.775, + "step": 11104, + "task_loss": 1.1045620441436768 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3041406273841858, + "epoch": 9.39, + "learning_rate": 3.4047149431764817e-06, + "loss": 0.4995, + "step": 11105, + "task_loss": 0.6274885535240173 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5086324214935303, + "epoch": 9.39, + "learning_rate": 3.4000187846341693e-06, + "loss": 0.5194, + "step": 11106, + "task_loss": 1.2403982877731323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6446537971496582, + "epoch": 9.39, + "learning_rate": 3.395322626091857e-06, + "loss": 0.4995, + "step": 11107, + "task_loss": 1.1975113153457642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6286758184432983, + "epoch": 9.39, + "learning_rate": 3.3906264675495446e-06, + "loss": 0.6297, + "step": 11108, + "task_loss": 0.6870454549789429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3783208131790161, + "epoch": 9.39, + "learning_rate": 3.3859303090072318e-06, + "loss": 0.4738, + "step": 11109, + "task_loss": 0.7632800936698914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9395403861999512, + "epoch": 9.39, + "learning_rate": 3.38123415046492e-06, + "loss": 0.7192, + "step": 11110, + "task_loss": 0.7955145835876465 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5888002514839172, + "epoch": 9.39, + "learning_rate": 3.376537991922608e-06, + "loss": 0.4269, + "step": 11111, + "task_loss": 1.1337482929229736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.2262974977493286, + "epoch": 9.39, + "learning_rate": 3.3718418333802954e-06, + "loss": 0.7263, + "step": 11112, + "task_loss": 1.088402271270752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27186477184295654, + "epoch": 9.39, + "learning_rate": 3.3671456748379826e-06, + "loss": 0.4708, + "step": 11113, + "task_loss": 0.032991018146276474 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7451769113540649, + "epoch": 9.39, + "learning_rate": 3.3624495162956702e-06, + "loss": 0.5645, + "step": 11114, + "task_loss": 1.7511882781982422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3677648901939392, + "epoch": 9.4, + "learning_rate": 3.357753357753358e-06, + "loss": 0.4433, + "step": 11115, + "task_loss": 0.2919082045555115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.23110246658325195, + "epoch": 9.4, + "learning_rate": 3.3530571992110454e-06, + "loss": 0.4979, + "step": 11116, + "task_loss": 0.49213844537734985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44743162393569946, + "epoch": 9.4, + "learning_rate": 3.3483610406687326e-06, + "loss": 0.4855, + "step": 11117, + "task_loss": 0.579852819442749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6172415614128113, + "epoch": 9.4, + "learning_rate": 3.343664882126421e-06, + "loss": 0.5107, + "step": 11118, + "task_loss": 0.899348795413971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5544959306716919, + "epoch": 9.4, + "learning_rate": 3.3389687235841087e-06, + "loss": 0.5974, + "step": 11119, + "task_loss": 0.2919134199619293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2875138223171234, + "epoch": 9.4, + "learning_rate": 3.334272565041796e-06, + "loss": 0.5386, + "step": 11120, + "task_loss": 0.5029345154762268 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46017682552337646, + "epoch": 9.4, + "learning_rate": 3.3295764064994835e-06, + "loss": 0.3596, + "step": 11121, + "task_loss": 0.40966182947158813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4596652388572693, + "epoch": 9.4, + "learning_rate": 3.324880247957171e-06, + "loss": 0.5156, + "step": 11122, + "task_loss": 1.057257890701294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46278709173202515, + "epoch": 9.4, + "learning_rate": 3.3201840894148587e-06, + "loss": 0.5099, + "step": 11123, + "task_loss": 0.43311333656311035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42865991592407227, + "epoch": 9.4, + "learning_rate": 3.3154879308725463e-06, + "loss": 0.5467, + "step": 11124, + "task_loss": 0.2056272029876709 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5136969685554504, + "epoch": 9.4, + "learning_rate": 3.3107917723302335e-06, + "loss": 0.5897, + "step": 11125, + "task_loss": 0.6179768443107605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6414124965667725, + "epoch": 9.4, + "learning_rate": 3.306095613787922e-06, + "loss": 0.7125, + "step": 11126, + "task_loss": 1.4045681953430176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6912810206413269, + "epoch": 9.41, + "learning_rate": 3.3013994552456096e-06, + "loss": 0.4868, + "step": 11127, + "task_loss": 0.502131462097168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6321882009506226, + "epoch": 9.41, + "learning_rate": 3.2967032967032968e-06, + "loss": 0.6458, + "step": 11128, + "task_loss": 0.8637170791625977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3912673592567444, + "epoch": 9.41, + "learning_rate": 3.2920071381609844e-06, + "loss": 0.5482, + "step": 11129, + "task_loss": 0.05519155412912369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8124796152114868, + "epoch": 9.41, + "learning_rate": 3.287310979618672e-06, + "loss": 0.6199, + "step": 11130, + "task_loss": 0.4740569293498993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5340747833251953, + "epoch": 9.41, + "learning_rate": 3.2826148210763596e-06, + "loss": 0.5758, + "step": 11131, + "task_loss": 1.6151121854782104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3482801914215088, + "epoch": 9.41, + "learning_rate": 3.277918662534047e-06, + "loss": 0.5309, + "step": 11132, + "task_loss": 0.9393897652626038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33681902289390564, + "epoch": 9.41, + "learning_rate": 3.2732225039917352e-06, + "loss": 0.3607, + "step": 11133, + "task_loss": 0.3562697172164917 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6691262722015381, + "epoch": 9.41, + "learning_rate": 3.268526345449423e-06, + "loss": 0.5117, + "step": 11134, + "task_loss": 0.4653528034687042 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39411160349845886, + "epoch": 9.41, + "learning_rate": 3.2638301869071105e-06, + "loss": 0.5078, + "step": 11135, + "task_loss": 0.8771699666976929 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2250085324048996, + "epoch": 9.41, + "learning_rate": 3.2591340283647977e-06, + "loss": 0.5092, + "step": 11136, + "task_loss": 0.8824301958084106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39933061599731445, + "epoch": 9.41, + "learning_rate": 3.2544378698224853e-06, + "loss": 0.519, + "step": 11137, + "task_loss": 0.5607483386993408 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5592647194862366, + "epoch": 9.41, + "learning_rate": 3.249741711280173e-06, + "loss": 0.4436, + "step": 11138, + "task_loss": 0.7491999864578247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4985281825065613, + "epoch": 9.42, + "learning_rate": 3.2450455527378605e-06, + "loss": 0.6113, + "step": 11139, + "task_loss": 0.47277867794036865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41220033168792725, + "epoch": 9.42, + "learning_rate": 3.240349394195548e-06, + "loss": 0.5791, + "step": 11140, + "task_loss": 1.0582389831542969 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33387482166290283, + "epoch": 9.42, + "learning_rate": 3.235653235653236e-06, + "loss": 0.5485, + "step": 11141, + "task_loss": 0.7039389610290527 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6312485337257385, + "epoch": 9.42, + "learning_rate": 3.2309570771109237e-06, + "loss": 0.6898, + "step": 11142, + "task_loss": 0.5599284172058105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5976980924606323, + "epoch": 9.42, + "learning_rate": 3.2262609185686113e-06, + "loss": 0.6298, + "step": 11143, + "task_loss": 0.30890753865242004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.295779287815094, + "epoch": 9.42, + "learning_rate": 3.2215647600262985e-06, + "loss": 0.5604, + "step": 11144, + "task_loss": 0.5468045473098755 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5590115785598755, + "epoch": 9.42, + "learning_rate": 3.216868601483986e-06, + "loss": 0.5013, + "step": 11145, + "task_loss": 0.49378541111946106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5546669363975525, + "epoch": 9.42, + "learning_rate": 3.2121724429416738e-06, + "loss": 0.6615, + "step": 11146, + "task_loss": 0.7564154267311096 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6785353422164917, + "epoch": 9.42, + "learning_rate": 3.2074762843993614e-06, + "loss": 0.554, + "step": 11147, + "task_loss": 0.40302467346191406 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7306913733482361, + "epoch": 9.42, + "learning_rate": 3.2027801258570485e-06, + "loss": 0.5921, + "step": 11148, + "task_loss": 0.6420811414718628 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48053300380706787, + "epoch": 9.42, + "learning_rate": 3.198083967314737e-06, + "loss": 0.5613, + "step": 11149, + "task_loss": 0.5979509353637695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.668728232383728, + "epoch": 9.42, + "learning_rate": 3.1933878087724246e-06, + "loss": 0.6341, + "step": 11150, + "task_loss": 1.4296324253082275 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3514482378959656, + "epoch": 9.43, + "learning_rate": 3.1886916502301122e-06, + "loss": 0.5504, + "step": 11151, + "task_loss": 0.8765593767166138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3960622549057007, + "epoch": 9.43, + "learning_rate": 3.1839954916877994e-06, + "loss": 0.5836, + "step": 11152, + "task_loss": 0.6594244837760925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9185369610786438, + "epoch": 9.43, + "learning_rate": 3.179299333145487e-06, + "loss": 0.6816, + "step": 11153, + "task_loss": 0.5660412311553955 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4780644178390503, + "epoch": 9.43, + "learning_rate": 3.1746031746031746e-06, + "loss": 0.5219, + "step": 11154, + "task_loss": 0.5521951913833618 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3349250853061676, + "epoch": 9.43, + "learning_rate": 3.1699070160608622e-06, + "loss": 0.4432, + "step": 11155, + "task_loss": 0.407823383808136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4655912518501282, + "epoch": 9.43, + "learning_rate": 3.1652108575185503e-06, + "loss": 0.514, + "step": 11156, + "task_loss": 0.4712706208229065 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5052464604377747, + "epoch": 9.43, + "learning_rate": 3.160514698976238e-06, + "loss": 0.5995, + "step": 11157, + "task_loss": 0.9926379323005676 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.731526255607605, + "epoch": 9.43, + "learning_rate": 3.1558185404339255e-06, + "loss": 0.5824, + "step": 11158, + "task_loss": 0.7762466669082642 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37927356362342834, + "epoch": 9.43, + "learning_rate": 3.1511223818916127e-06, + "loss": 0.6127, + "step": 11159, + "task_loss": 0.8265427350997925 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3676943778991699, + "epoch": 9.43, + "learning_rate": 3.1464262233493003e-06, + "loss": 0.4114, + "step": 11160, + "task_loss": 0.25177186727523804 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35482627153396606, + "epoch": 9.43, + "learning_rate": 3.141730064806988e-06, + "loss": 0.4682, + "step": 11161, + "task_loss": 0.4318864941596985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4356091022491455, + "epoch": 9.44, + "learning_rate": 3.1370339062646755e-06, + "loss": 0.5349, + "step": 11162, + "task_loss": 0.6081743836402893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7963196039199829, + "epoch": 9.44, + "learning_rate": 3.132337747722363e-06, + "loss": 0.5166, + "step": 11163, + "task_loss": 1.1281956434249878 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4117113947868347, + "epoch": 9.44, + "learning_rate": 3.127641589180051e-06, + "loss": 0.5554, + "step": 11164, + "task_loss": 0.8243807554244995 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3230505585670471, + "epoch": 9.44, + "learning_rate": 3.1229454306377383e-06, + "loss": 0.4225, + "step": 11165, + "task_loss": 0.9352537989616394 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35018107295036316, + "epoch": 9.44, + "learning_rate": 3.1182492720954264e-06, + "loss": 0.5008, + "step": 11166, + "task_loss": 1.1259442567825317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5738670825958252, + "epoch": 9.44, + "learning_rate": 3.1135531135531136e-06, + "loss": 0.4466, + "step": 11167, + "task_loss": 0.24371032416820526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5985697507858276, + "epoch": 9.44, + "learning_rate": 3.108856955010801e-06, + "loss": 0.5247, + "step": 11168, + "task_loss": 0.48119136691093445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5480961799621582, + "epoch": 9.44, + "learning_rate": 3.1041607964684888e-06, + "loss": 0.6098, + "step": 11169, + "task_loss": 1.2812097072601318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5549170970916748, + "epoch": 9.44, + "learning_rate": 3.099464637926177e-06, + "loss": 0.6009, + "step": 11170, + "task_loss": 0.3722454905509949 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5054013729095459, + "epoch": 9.44, + "learning_rate": 3.094768479383864e-06, + "loss": 0.5076, + "step": 11171, + "task_loss": 1.0438300371170044 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3445597290992737, + "epoch": 9.44, + "learning_rate": 3.0900723208415516e-06, + "loss": 0.5084, + "step": 11172, + "task_loss": 0.5532470345497131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8288873434066772, + "epoch": 9.44, + "learning_rate": 3.0853761622992392e-06, + "loss": 0.5571, + "step": 11173, + "task_loss": 1.179551362991333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3122905194759369, + "epoch": 9.45, + "learning_rate": 3.0806800037569273e-06, + "loss": 0.6211, + "step": 11174, + "task_loss": 0.8374273777008057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4194033741950989, + "epoch": 9.45, + "learning_rate": 3.0759838452146144e-06, + "loss": 0.4235, + "step": 11175, + "task_loss": 0.7910700440406799 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5038051009178162, + "epoch": 9.45, + "learning_rate": 3.071287686672302e-06, + "loss": 0.6776, + "step": 11176, + "task_loss": 0.6810777187347412 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30673205852508545, + "epoch": 9.45, + "learning_rate": 3.0665915281299897e-06, + "loss": 0.3794, + "step": 11177, + "task_loss": 0.2551667094230652 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26858779788017273, + "epoch": 9.45, + "learning_rate": 3.0618953695876777e-06, + "loss": 0.5452, + "step": 11178, + "task_loss": 0.5609457492828369 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6627442836761475, + "epoch": 9.45, + "learning_rate": 3.057199211045365e-06, + "loss": 0.6484, + "step": 11179, + "task_loss": 0.4815748929977417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5693459510803223, + "epoch": 9.45, + "learning_rate": 3.0525030525030525e-06, + "loss": 0.693, + "step": 11180, + "task_loss": 0.14833194017410278 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27992063760757446, + "epoch": 9.45, + "learning_rate": 3.0478068939607405e-06, + "loss": 0.4317, + "step": 11181, + "task_loss": 0.9693751335144043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6363602876663208, + "epoch": 9.45, + "learning_rate": 3.043110735418428e-06, + "loss": 0.7418, + "step": 11182, + "task_loss": 1.4462608098983765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4637390375137329, + "epoch": 9.45, + "learning_rate": 3.0384145768761153e-06, + "loss": 0.5685, + "step": 11183, + "task_loss": 0.48982736468315125 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7309938073158264, + "epoch": 9.45, + "learning_rate": 3.033718418333803e-06, + "loss": 0.6088, + "step": 11184, + "task_loss": 0.9512088894844055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5189377069473267, + "epoch": 9.45, + "learning_rate": 3.029022259791491e-06, + "loss": 0.5628, + "step": 11185, + "task_loss": 0.7732096910476685 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.13867291808128357, + "epoch": 9.46, + "learning_rate": 3.0243261012491786e-06, + "loss": 0.3426, + "step": 11186, + "task_loss": 0.043049365282058716 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8073858022689819, + "epoch": 9.46, + "learning_rate": 3.0196299427068658e-06, + "loss": 0.5108, + "step": 11187, + "task_loss": 0.7730945944786072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3671865165233612, + "epoch": 9.46, + "learning_rate": 3.0149337841645534e-06, + "loss": 0.5131, + "step": 11188, + "task_loss": 0.6448646187782288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48400965332984924, + "epoch": 9.46, + "learning_rate": 3.0102376256222414e-06, + "loss": 0.5077, + "step": 11189, + "task_loss": 1.7643741369247437 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5058324337005615, + "epoch": 9.46, + "learning_rate": 3.0055414670799286e-06, + "loss": 0.4617, + "step": 11190, + "task_loss": 1.250806212425232 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38470160961151123, + "epoch": 9.46, + "learning_rate": 3.0008453085376162e-06, + "loss": 0.4523, + "step": 11191, + "task_loss": 0.4217795729637146 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5489324331283569, + "epoch": 9.46, + "learning_rate": 2.996149149995304e-06, + "loss": 0.6275, + "step": 11192, + "task_loss": 0.5095259547233582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4338130056858063, + "epoch": 9.46, + "learning_rate": 2.991452991452992e-06, + "loss": 0.5194, + "step": 11193, + "task_loss": 0.5813493132591248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39201757311820984, + "epoch": 9.46, + "learning_rate": 2.986756832910679e-06, + "loss": 0.4791, + "step": 11194, + "task_loss": 1.7503974437713623 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4456171989440918, + "epoch": 9.46, + "learning_rate": 2.9820606743683667e-06, + "loss": 0.4746, + "step": 11195, + "task_loss": 1.2226107120513916 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39763957262039185, + "epoch": 9.46, + "learning_rate": 2.9773645158260543e-06, + "loss": 0.4075, + "step": 11196, + "task_loss": 0.5088104009628296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42229801416397095, + "epoch": 9.46, + "learning_rate": 2.9726683572837423e-06, + "loss": 0.4595, + "step": 11197, + "task_loss": 0.8628992438316345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4571329951286316, + "epoch": 9.47, + "learning_rate": 2.9679721987414295e-06, + "loss": 0.4993, + "step": 11198, + "task_loss": 0.5981459021568298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5947855710983276, + "epoch": 9.47, + "learning_rate": 2.963276040199117e-06, + "loss": 0.5615, + "step": 11199, + "task_loss": 0.8464884757995605 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4146347641944885, + "epoch": 9.47, + "learning_rate": 2.9585798816568047e-06, + "loss": 0.553, + "step": 11200, + "task_loss": 0.535025954246521 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.797527015209198, + "epoch": 9.47, + "learning_rate": 2.9538837231144927e-06, + "loss": 0.5237, + "step": 11201, + "task_loss": 0.6189848184585571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30893757939338684, + "epoch": 9.47, + "learning_rate": 2.94918756457218e-06, + "loss": 0.4715, + "step": 11202, + "task_loss": 0.028159160166978836 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49830928444862366, + "epoch": 9.47, + "learning_rate": 2.9444914060298675e-06, + "loss": 0.5356, + "step": 11203, + "task_loss": 1.4245926141738892 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7145611047744751, + "epoch": 9.47, + "learning_rate": 2.939795247487555e-06, + "loss": 0.5594, + "step": 11204, + "task_loss": 0.6180378198623657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.17425081133842468, + "epoch": 9.47, + "learning_rate": 2.935099088945243e-06, + "loss": 0.4694, + "step": 11205, + "task_loss": 0.06243056431412697 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3144095838069916, + "epoch": 9.47, + "learning_rate": 2.9304029304029304e-06, + "loss": 0.5122, + "step": 11206, + "task_loss": 0.11671505123376846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4932126998901367, + "epoch": 9.47, + "learning_rate": 2.925706771860618e-06, + "loss": 0.4326, + "step": 11207, + "task_loss": 0.2425106316804886 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9185200333595276, + "epoch": 9.47, + "learning_rate": 2.921010613318306e-06, + "loss": 0.6074, + "step": 11208, + "task_loss": 1.4237382411956787 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6256715059280396, + "epoch": 9.47, + "learning_rate": 2.9163144547759936e-06, + "loss": 0.6569, + "step": 11209, + "task_loss": 1.1951192617416382 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8609659075737, + "epoch": 9.48, + "learning_rate": 2.911618296233681e-06, + "loss": 0.591, + "step": 11210, + "task_loss": 0.6378014087677002 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5100657939910889, + "epoch": 9.48, + "learning_rate": 2.9069221376913684e-06, + "loss": 0.6524, + "step": 11211, + "task_loss": 0.2991871237754822 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.563478410243988, + "epoch": 9.48, + "learning_rate": 2.9022259791490565e-06, + "loss": 0.5434, + "step": 11212, + "task_loss": 0.3929310142993927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4076765477657318, + "epoch": 9.48, + "learning_rate": 2.897529820606744e-06, + "loss": 0.4575, + "step": 11213, + "task_loss": 0.3963709771633148 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29768604040145874, + "epoch": 9.48, + "learning_rate": 2.8928336620644312e-06, + "loss": 0.3552, + "step": 11214, + "task_loss": 0.5723297595977783 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2555665373802185, + "epoch": 9.48, + "learning_rate": 2.888137503522119e-06, + "loss": 0.4123, + "step": 11215, + "task_loss": 0.22902387380599976 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5171318650245667, + "epoch": 9.48, + "learning_rate": 2.883441344979807e-06, + "loss": 0.4955, + "step": 11216, + "task_loss": 0.9553980827331543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4600731134414673, + "epoch": 9.48, + "learning_rate": 2.8787451864374945e-06, + "loss": 0.6287, + "step": 11217, + "task_loss": 0.5804479122161865 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.414058655500412, + "epoch": 9.48, + "learning_rate": 2.8740490278951817e-06, + "loss": 0.5244, + "step": 11218, + "task_loss": 0.20023566484451294 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47961902618408203, + "epoch": 9.48, + "learning_rate": 2.8693528693528693e-06, + "loss": 0.4695, + "step": 11219, + "task_loss": 0.9276801347732544 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5101318955421448, + "epoch": 9.48, + "learning_rate": 2.8646567108105573e-06, + "loss": 0.4577, + "step": 11220, + "task_loss": 0.4558369517326355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.699172854423523, + "epoch": 9.48, + "learning_rate": 2.859960552268245e-06, + "loss": 0.6084, + "step": 11221, + "task_loss": 1.4894967079162598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3634474277496338, + "epoch": 9.49, + "learning_rate": 2.855264393725932e-06, + "loss": 0.4718, + "step": 11222, + "task_loss": 0.5079518556594849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5747938752174377, + "epoch": 9.49, + "learning_rate": 2.8505682351836197e-06, + "loss": 0.5409, + "step": 11223, + "task_loss": 0.9425478577613831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7430768013000488, + "epoch": 9.49, + "learning_rate": 2.8458720766413078e-06, + "loss": 0.6543, + "step": 11224, + "task_loss": 1.360588788986206 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5270208120346069, + "epoch": 9.49, + "learning_rate": 2.8411759180989954e-06, + "loss": 0.6836, + "step": 11225, + "task_loss": 0.9736447930335999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6401659250259399, + "epoch": 9.49, + "learning_rate": 2.8364797595566826e-06, + "loss": 0.4673, + "step": 11226, + "task_loss": 0.48319482803344727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.693993330001831, + "epoch": 9.49, + "learning_rate": 2.83178360101437e-06, + "loss": 0.4836, + "step": 11227, + "task_loss": 1.4331079721450806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5109017491340637, + "epoch": 9.49, + "learning_rate": 2.8270874424720582e-06, + "loss": 0.5796, + "step": 11228, + "task_loss": 0.126139834523201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7835152745246887, + "epoch": 9.49, + "learning_rate": 2.8223912839297454e-06, + "loss": 0.5748, + "step": 11229, + "task_loss": 0.43810319900512695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4638989269733429, + "epoch": 9.49, + "learning_rate": 2.817695125387433e-06, + "loss": 0.4891, + "step": 11230, + "task_loss": 0.5794159770011902 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5640575289726257, + "epoch": 9.49, + "learning_rate": 2.8129989668451206e-06, + "loss": 0.4949, + "step": 11231, + "task_loss": 0.422950804233551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6392498016357422, + "epoch": 9.49, + "learning_rate": 2.8083028083028087e-06, + "loss": 0.4538, + "step": 11232, + "task_loss": 0.8638476729393005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8038769960403442, + "epoch": 9.5, + "learning_rate": 2.803606649760496e-06, + "loss": 0.5792, + "step": 11233, + "task_loss": 1.003324270248413 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3015412390232086, + "epoch": 9.5, + "learning_rate": 2.7989104912181835e-06, + "loss": 0.4918, + "step": 11234, + "task_loss": 0.31251290440559387 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4549716114997864, + "epoch": 9.5, + "learning_rate": 2.7942143326758715e-06, + "loss": 0.3199, + "step": 11235, + "task_loss": 0.7041872143745422 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6487836837768555, + "epoch": 9.5, + "learning_rate": 2.789518174133559e-06, + "loss": 0.5095, + "step": 11236, + "task_loss": 0.5114790201187134 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5082249045372009, + "epoch": 9.5, + "learning_rate": 2.7848220155912463e-06, + "loss": 0.4761, + "step": 11237, + "task_loss": 0.34324270486831665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5402510762214661, + "epoch": 9.5, + "learning_rate": 2.780125857048934e-06, + "loss": 0.4301, + "step": 11238, + "task_loss": 1.1007641553878784 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5648353099822998, + "epoch": 9.5, + "learning_rate": 2.775429698506622e-06, + "loss": 0.5534, + "step": 11239, + "task_loss": 1.6882665157318115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.724153995513916, + "epoch": 9.5, + "learning_rate": 2.7707335399643095e-06, + "loss": 0.7064, + "step": 11240, + "task_loss": 0.3488437831401825 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4707251787185669, + "epoch": 9.5, + "learning_rate": 2.7660373814219967e-06, + "loss": 0.5252, + "step": 11241, + "task_loss": 0.8056434988975525 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3624878227710724, + "epoch": 9.5, + "learning_rate": 2.7613412228796843e-06, + "loss": 0.5243, + "step": 11242, + "task_loss": 1.3109062910079956 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46566590666770935, + "epoch": 9.5, + "learning_rate": 2.7566450643373724e-06, + "loss": 0.6326, + "step": 11243, + "task_loss": 1.012470006942749 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.391556978225708, + "epoch": 9.5, + "learning_rate": 2.75194890579506e-06, + "loss": 0.5736, + "step": 11244, + "task_loss": 0.32321351766586304 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49198341369628906, + "epoch": 9.51, + "learning_rate": 2.747252747252747e-06, + "loss": 0.4948, + "step": 11245, + "task_loss": 0.6251381039619446 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3979555070400238, + "epoch": 9.51, + "learning_rate": 2.7425565887104348e-06, + "loss": 0.5793, + "step": 11246, + "task_loss": 0.2804372310638428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5097079873085022, + "epoch": 9.51, + "learning_rate": 2.737860430168123e-06, + "loss": 0.5428, + "step": 11247, + "task_loss": 1.2022968530654907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6925046443939209, + "epoch": 9.51, + "learning_rate": 2.7331642716258104e-06, + "loss": 0.4705, + "step": 11248, + "task_loss": 0.45006683468818665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46897977590560913, + "epoch": 9.51, + "learning_rate": 2.7284681130834976e-06, + "loss": 0.4792, + "step": 11249, + "task_loss": 0.30467385053634644 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6085516214370728, + "epoch": 9.51, + "learning_rate": 2.7237719545411852e-06, + "loss": 0.5106, + "step": 11250, + "task_loss": 1.0562021732330322 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5249451398849487, + "epoch": 9.51, + "learning_rate": 2.7190757959988733e-06, + "loss": 0.7262, + "step": 11251, + "task_loss": 0.4065154492855072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5905717015266418, + "epoch": 9.51, + "learning_rate": 2.714379637456561e-06, + "loss": 0.4682, + "step": 11252, + "task_loss": 0.44764643907546997 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4434654116630554, + "epoch": 9.51, + "learning_rate": 2.709683478914248e-06, + "loss": 0.3969, + "step": 11253, + "task_loss": 0.8883944749832153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9155362248420715, + "epoch": 9.51, + "learning_rate": 2.7049873203719357e-06, + "loss": 0.7403, + "step": 11254, + "task_loss": 1.9760304689407349 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2908838391304016, + "epoch": 9.51, + "learning_rate": 2.7002911618296237e-06, + "loss": 0.4126, + "step": 11255, + "task_loss": 0.2298804372549057 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3395985960960388, + "epoch": 9.51, + "learning_rate": 2.6955950032873113e-06, + "loss": 0.4989, + "step": 11256, + "task_loss": 0.6705290079116821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6614788174629211, + "epoch": 9.52, + "learning_rate": 2.6908988447449985e-06, + "loss": 0.4866, + "step": 11257, + "task_loss": 0.5600391626358032 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4609816074371338, + "epoch": 9.52, + "learning_rate": 2.6862026862026865e-06, + "loss": 0.5995, + "step": 11258, + "task_loss": 0.5131989121437073 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3335059583187103, + "epoch": 9.52, + "learning_rate": 2.681506527660374e-06, + "loss": 0.4402, + "step": 11259, + "task_loss": 0.5076111555099487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4667609930038452, + "epoch": 9.52, + "learning_rate": 2.6768103691180617e-06, + "loss": 0.459, + "step": 11260, + "task_loss": 0.5480345487594604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5583664178848267, + "epoch": 9.52, + "learning_rate": 2.672114210575749e-06, + "loss": 0.5342, + "step": 11261, + "task_loss": 0.49604201316833496 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5955600142478943, + "epoch": 9.52, + "learning_rate": 2.667418052033437e-06, + "loss": 0.5301, + "step": 11262, + "task_loss": 0.6410098671913147 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.064489483833313, + "epoch": 9.52, + "learning_rate": 2.6627218934911246e-06, + "loss": 0.595, + "step": 11263, + "task_loss": 0.5184519290924072 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4310872554779053, + "epoch": 9.52, + "learning_rate": 2.658025734948812e-06, + "loss": 0.496, + "step": 11264, + "task_loss": 0.7246324419975281 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3592475950717926, + "epoch": 9.52, + "learning_rate": 2.6533295764064994e-06, + "loss": 0.452, + "step": 11265, + "task_loss": 0.1540917158126831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6691977381706238, + "epoch": 9.52, + "learning_rate": 2.6486334178641874e-06, + "loss": 0.6251, + "step": 11266, + "task_loss": 0.608206033706665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7982854843139648, + "epoch": 9.52, + "learning_rate": 2.643937259321875e-06, + "loss": 0.5489, + "step": 11267, + "task_loss": 1.2630400657653809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5251617431640625, + "epoch": 9.52, + "learning_rate": 2.639241100779562e-06, + "loss": 0.5587, + "step": 11268, + "task_loss": 0.5851114988327026 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6244267821311951, + "epoch": 9.53, + "learning_rate": 2.63454494223725e-06, + "loss": 0.4604, + "step": 11269, + "task_loss": 0.6942358613014221 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3187119662761688, + "epoch": 9.53, + "learning_rate": 2.629848783694938e-06, + "loss": 0.5795, + "step": 11270, + "task_loss": 0.3793101906776428 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47814345359802246, + "epoch": 9.53, + "learning_rate": 2.6251526251526255e-06, + "loss": 0.5729, + "step": 11271, + "task_loss": 0.982180655002594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.24039261043071747, + "epoch": 9.53, + "learning_rate": 2.6204564666103126e-06, + "loss": 0.3678, + "step": 11272, + "task_loss": 0.14001110196113586 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6196792125701904, + "epoch": 9.53, + "learning_rate": 2.6157603080680003e-06, + "loss": 0.5614, + "step": 11273, + "task_loss": 0.9769834280014038 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6720485687255859, + "epoch": 9.53, + "learning_rate": 2.6110641495256883e-06, + "loss": 0.4556, + "step": 11274, + "task_loss": 0.5077792406082153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4280302822589874, + "epoch": 9.53, + "learning_rate": 2.606367990983376e-06, + "loss": 0.5717, + "step": 11275, + "task_loss": 0.4765695333480835 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3548336625099182, + "epoch": 9.53, + "learning_rate": 2.601671832441063e-06, + "loss": 0.4716, + "step": 11276, + "task_loss": 0.23736508190631866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2945220172405243, + "epoch": 9.53, + "learning_rate": 2.5969756738987507e-06, + "loss": 0.4943, + "step": 11277, + "task_loss": 0.08198618143796921 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4968324303627014, + "epoch": 9.53, + "learning_rate": 2.5922795153564387e-06, + "loss": 0.4399, + "step": 11278, + "task_loss": 0.7858983874320984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3508654534816742, + "epoch": 9.53, + "learning_rate": 2.5875833568141263e-06, + "loss": 0.4329, + "step": 11279, + "task_loss": 1.3209810256958008 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31715792417526245, + "epoch": 9.53, + "learning_rate": 2.5828871982718135e-06, + "loss": 0.4148, + "step": 11280, + "task_loss": 0.45970574021339417 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6455689072608948, + "epoch": 9.54, + "learning_rate": 2.578191039729501e-06, + "loss": 0.5056, + "step": 11281, + "task_loss": 2.1212663650512695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6583182215690613, + "epoch": 9.54, + "learning_rate": 2.573494881187189e-06, + "loss": 0.7417, + "step": 11282, + "task_loss": 0.46663668751716614 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7477010488510132, + "epoch": 9.54, + "learning_rate": 2.5687987226448768e-06, + "loss": 0.6183, + "step": 11283, + "task_loss": 1.1823301315307617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.591083824634552, + "epoch": 9.54, + "learning_rate": 2.564102564102564e-06, + "loss": 0.4667, + "step": 11284, + "task_loss": 0.5827910900115967 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41403722763061523, + "epoch": 9.54, + "learning_rate": 2.559406405560252e-06, + "loss": 0.5062, + "step": 11285, + "task_loss": 1.3135507106781006 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5059385895729065, + "epoch": 9.54, + "learning_rate": 2.5547102470179396e-06, + "loss": 0.5766, + "step": 11286, + "task_loss": 0.7129079699516296 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4497736096382141, + "epoch": 9.54, + "learning_rate": 2.5500140884756272e-06, + "loss": 0.5952, + "step": 11287, + "task_loss": 0.3709377348423004 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9262888431549072, + "epoch": 9.54, + "learning_rate": 2.5453179299333144e-06, + "loss": 0.6886, + "step": 11288, + "task_loss": 1.3071322441101074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4404759407043457, + "epoch": 9.54, + "learning_rate": 2.5406217713910024e-06, + "loss": 0.4869, + "step": 11289, + "task_loss": 0.5400684475898743 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5740737915039062, + "epoch": 9.54, + "learning_rate": 2.53592561284869e-06, + "loss": 0.6466, + "step": 11290, + "task_loss": 1.1421723365783691 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3339034914970398, + "epoch": 9.54, + "learning_rate": 2.5312294543063777e-06, + "loss": 0.4779, + "step": 11291, + "task_loss": 0.5302637219429016 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3346141278743744, + "epoch": 9.54, + "learning_rate": 2.526533295764065e-06, + "loss": 0.4762, + "step": 11292, + "task_loss": 1.3013646602630615 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.065484881401062, + "epoch": 9.55, + "learning_rate": 2.521837137221753e-06, + "loss": 0.6517, + "step": 11293, + "task_loss": 0.6578007340431213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7507924437522888, + "epoch": 9.55, + "learning_rate": 2.5171409786794405e-06, + "loss": 0.5953, + "step": 11294, + "task_loss": 0.45313555002212524 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3916409909725189, + "epoch": 9.55, + "learning_rate": 2.512444820137128e-06, + "loss": 0.4647, + "step": 11295, + "task_loss": 0.05582456290721893 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.91286301612854, + "epoch": 9.55, + "learning_rate": 2.5077486615948153e-06, + "loss": 0.595, + "step": 11296, + "task_loss": 0.8451696634292603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29353976249694824, + "epoch": 9.55, + "learning_rate": 2.5030525030525033e-06, + "loss": 0.484, + "step": 11297, + "task_loss": 0.8092113137245178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4351484477519989, + "epoch": 9.55, + "learning_rate": 2.498356344510191e-06, + "loss": 0.5125, + "step": 11298, + "task_loss": 0.7138547897338867 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5433275699615479, + "epoch": 9.55, + "learning_rate": 2.4936601859678785e-06, + "loss": 0.4803, + "step": 11299, + "task_loss": 0.9670594334602356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4451245069503784, + "epoch": 9.55, + "learning_rate": 2.4889640274255657e-06, + "loss": 0.6376, + "step": 11300, + "task_loss": 0.4943684935569763 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6339473724365234, + "epoch": 9.55, + "learning_rate": 2.4842678688832538e-06, + "loss": 0.6747, + "step": 11301, + "task_loss": 1.3400665521621704 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3447936773300171, + "epoch": 9.55, + "learning_rate": 2.4795717103409414e-06, + "loss": 0.5308, + "step": 11302, + "task_loss": 0.5028912425041199 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7479475736618042, + "epoch": 9.55, + "learning_rate": 2.474875551798629e-06, + "loss": 0.4428, + "step": 11303, + "task_loss": 0.39573919773101807 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43042832612991333, + "epoch": 9.56, + "learning_rate": 2.470179393256316e-06, + "loss": 0.6145, + "step": 11304, + "task_loss": 0.48770764470100403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6956771612167358, + "epoch": 9.56, + "learning_rate": 2.465483234714004e-06, + "loss": 0.4791, + "step": 11305, + "task_loss": 0.5823579430580139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40375658869743347, + "epoch": 9.56, + "learning_rate": 2.460787076171692e-06, + "loss": 0.5237, + "step": 11306, + "task_loss": 0.7839794158935547 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5448088645935059, + "epoch": 9.56, + "learning_rate": 2.456090917629379e-06, + "loss": 0.4768, + "step": 11307, + "task_loss": 0.505507230758667 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35148441791534424, + "epoch": 9.56, + "learning_rate": 2.4513947590870666e-06, + "loss": 0.5431, + "step": 11308, + "task_loss": 0.5940706133842468 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6245949268341064, + "epoch": 9.56, + "learning_rate": 2.4466986005447546e-06, + "loss": 0.5536, + "step": 11309, + "task_loss": 0.6330739259719849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27357620000839233, + "epoch": 9.56, + "learning_rate": 2.4420024420024423e-06, + "loss": 0.4598, + "step": 11310, + "task_loss": 0.3091890513896942 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37369900941848755, + "epoch": 9.56, + "learning_rate": 2.4373062834601294e-06, + "loss": 0.5864, + "step": 11311, + "task_loss": 0.637654185295105 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27571484446525574, + "epoch": 9.56, + "learning_rate": 2.4326101249178175e-06, + "loss": 0.5353, + "step": 11312, + "task_loss": 0.8123959898948669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5271096229553223, + "epoch": 9.56, + "learning_rate": 2.427913966375505e-06, + "loss": 0.5632, + "step": 11313, + "task_loss": 0.6251791715621948 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4978424310684204, + "epoch": 9.56, + "learning_rate": 2.4232178078331927e-06, + "loss": 0.5198, + "step": 11314, + "task_loss": 1.0911293029785156 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47109702229499817, + "epoch": 9.56, + "learning_rate": 2.41852164929088e-06, + "loss": 0.5242, + "step": 11315, + "task_loss": 0.7716821432113647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.501494288444519, + "epoch": 9.57, + "learning_rate": 2.413825490748568e-06, + "loss": 0.5089, + "step": 11316, + "task_loss": 0.7109674215316772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.679937481880188, + "epoch": 9.57, + "learning_rate": 2.4091293322062555e-06, + "loss": 0.4764, + "step": 11317, + "task_loss": 0.4361709654331207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35518068075180054, + "epoch": 9.57, + "learning_rate": 2.404433173663943e-06, + "loss": 0.467, + "step": 11318, + "task_loss": 0.7475529909133911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.568520188331604, + "epoch": 9.57, + "learning_rate": 2.3997370151216303e-06, + "loss": 0.544, + "step": 11319, + "task_loss": 0.4720578193664551 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3220349848270416, + "epoch": 9.57, + "learning_rate": 2.3950408565793184e-06, + "loss": 0.4714, + "step": 11320, + "task_loss": 0.7138441801071167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4054856598377228, + "epoch": 9.57, + "learning_rate": 2.390344698037006e-06, + "loss": 0.5885, + "step": 11321, + "task_loss": 0.4209631383419037 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5548774600028992, + "epoch": 9.57, + "learning_rate": 2.3856485394946936e-06, + "loss": 0.4313, + "step": 11322, + "task_loss": 0.3082820475101471 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4837498366832733, + "epoch": 9.57, + "learning_rate": 2.3809523809523808e-06, + "loss": 0.4914, + "step": 11323, + "task_loss": 0.40177401900291443 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4989627003669739, + "epoch": 9.57, + "learning_rate": 2.376256222410069e-06, + "loss": 0.8006, + "step": 11324, + "task_loss": 0.6497212648391724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4094073176383972, + "epoch": 9.57, + "learning_rate": 2.3715600638677564e-06, + "loss": 0.4625, + "step": 11325, + "task_loss": 0.4828144311904907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40828120708465576, + "epoch": 9.57, + "learning_rate": 2.366863905325444e-06, + "loss": 0.5198, + "step": 11326, + "task_loss": 1.229005217552185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3773658573627472, + "epoch": 9.57, + "learning_rate": 2.362167746783131e-06, + "loss": 0.4491, + "step": 11327, + "task_loss": 0.20502349734306335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5077579617500305, + "epoch": 9.58, + "learning_rate": 2.3574715882408192e-06, + "loss": 0.5436, + "step": 11328, + "task_loss": 0.8033504486083984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7943429946899414, + "epoch": 9.58, + "learning_rate": 2.352775429698507e-06, + "loss": 0.559, + "step": 11329, + "task_loss": 0.4793477952480316 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3950050473213196, + "epoch": 9.58, + "learning_rate": 2.3480792711561945e-06, + "loss": 0.4086, + "step": 11330, + "task_loss": 0.730297863483429 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4508638083934784, + "epoch": 9.58, + "learning_rate": 2.3433831126138816e-06, + "loss": 0.3725, + "step": 11331, + "task_loss": 0.7669834494590759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.416669636964798, + "epoch": 9.58, + "learning_rate": 2.3386869540715697e-06, + "loss": 0.4423, + "step": 11332, + "task_loss": 0.4165184497833252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6083411574363708, + "epoch": 9.58, + "learning_rate": 2.3339907955292573e-06, + "loss": 0.5312, + "step": 11333, + "task_loss": 0.9701744318008423 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48547062277793884, + "epoch": 9.58, + "learning_rate": 2.329294636986945e-06, + "loss": 0.4196, + "step": 11334, + "task_loss": 0.09378504753112793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6530546545982361, + "epoch": 9.58, + "learning_rate": 2.324598478444632e-06, + "loss": 0.526, + "step": 11335, + "task_loss": 0.6071787476539612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44750842452049255, + "epoch": 9.58, + "learning_rate": 2.31990231990232e-06, + "loss": 0.4795, + "step": 11336, + "task_loss": 0.4807664453983307 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4324527382850647, + "epoch": 9.58, + "learning_rate": 2.3152061613600077e-06, + "loss": 0.498, + "step": 11337, + "task_loss": 0.028464380651712418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45919033885002136, + "epoch": 9.58, + "learning_rate": 2.3105100028176953e-06, + "loss": 0.6393, + "step": 11338, + "task_loss": 1.1389330625534058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26782023906707764, + "epoch": 9.58, + "learning_rate": 2.305813844275383e-06, + "loss": 0.3574, + "step": 11339, + "task_loss": 0.4858621060848236 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7754696607589722, + "epoch": 9.59, + "learning_rate": 2.3011176857330706e-06, + "loss": 0.5754, + "step": 11340, + "task_loss": 0.22865431010723114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6703416109085083, + "epoch": 9.59, + "learning_rate": 2.296421527190758e-06, + "loss": 0.607, + "step": 11341, + "task_loss": 1.3635218143463135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4295088052749634, + "epoch": 9.59, + "learning_rate": 2.2917253686484458e-06, + "loss": 0.5457, + "step": 11342, + "task_loss": 0.8019813299179077 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49722224473953247, + "epoch": 9.59, + "learning_rate": 2.2870292101061334e-06, + "loss": 0.5803, + "step": 11343, + "task_loss": 0.5000154376029968 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6082929372787476, + "epoch": 9.59, + "learning_rate": 2.282333051563821e-06, + "loss": 0.5924, + "step": 11344, + "task_loss": 1.2574076652526855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39489513635635376, + "epoch": 9.59, + "learning_rate": 2.2776368930215086e-06, + "loss": 0.4818, + "step": 11345, + "task_loss": 0.6837270855903625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8449753522872925, + "epoch": 9.59, + "learning_rate": 2.272940734479196e-06, + "loss": 0.709, + "step": 11346, + "task_loss": 1.0779989957809448 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3333572447299957, + "epoch": 9.59, + "learning_rate": 2.268244575936884e-06, + "loss": 0.5152, + "step": 11347, + "task_loss": 0.21496634185314178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5039761662483215, + "epoch": 9.59, + "learning_rate": 2.2635484173945714e-06, + "loss": 0.4809, + "step": 11348, + "task_loss": 1.1403034925460815 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45210978388786316, + "epoch": 9.59, + "learning_rate": 2.258852258852259e-06, + "loss": 0.5549, + "step": 11349, + "task_loss": 0.4723094403743744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6134293079376221, + "epoch": 9.59, + "learning_rate": 2.2541561003099462e-06, + "loss": 0.5576, + "step": 11350, + "task_loss": 0.9434386491775513 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.442374587059021, + "epoch": 9.59, + "learning_rate": 2.2494599417676343e-06, + "loss": 0.6503, + "step": 11351, + "task_loss": 0.2529735267162323 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5978698134422302, + "epoch": 9.6, + "learning_rate": 2.244763783225322e-06, + "loss": 0.5574, + "step": 11352, + "task_loss": 0.43565264344215393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6101328730583191, + "epoch": 9.6, + "learning_rate": 2.2400676246830095e-06, + "loss": 0.5711, + "step": 11353, + "task_loss": 0.9153672456741333 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7736935019493103, + "epoch": 9.6, + "learning_rate": 2.2353714661406967e-06, + "loss": 0.5049, + "step": 11354, + "task_loss": 0.26655423641204834 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5841749906539917, + "epoch": 9.6, + "learning_rate": 2.2306753075983847e-06, + "loss": 0.6524, + "step": 11355, + "task_loss": 1.1711784601211548 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49926966428756714, + "epoch": 9.6, + "learning_rate": 2.2259791490560723e-06, + "loss": 0.5932, + "step": 11356, + "task_loss": 1.5748735666275024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.23277291655540466, + "epoch": 9.6, + "learning_rate": 2.22128299051376e-06, + "loss": 0.4584, + "step": 11357, + "task_loss": 0.8467684984207153 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5206531286239624, + "epoch": 9.6, + "learning_rate": 2.216586831971447e-06, + "loss": 0.4915, + "step": 11358, + "task_loss": 0.45865365862846375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4101598858833313, + "epoch": 9.6, + "learning_rate": 2.211890673429135e-06, + "loss": 0.4704, + "step": 11359, + "task_loss": 0.8500105142593384 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6126411557197571, + "epoch": 9.6, + "learning_rate": 2.2071945148868228e-06, + "loss": 0.562, + "step": 11360, + "task_loss": 1.0517301559448242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.50624680519104, + "epoch": 9.6, + "learning_rate": 2.2024983563445104e-06, + "loss": 0.5617, + "step": 11361, + "task_loss": 0.7073647379875183 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3042835295200348, + "epoch": 9.6, + "learning_rate": 2.197802197802198e-06, + "loss": 0.3655, + "step": 11362, + "task_loss": 0.21476058661937714 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4579913914203644, + "epoch": 9.6, + "learning_rate": 2.1931060392598856e-06, + "loss": 0.4399, + "step": 11363, + "task_loss": 0.38955485820770264 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8641185760498047, + "epoch": 9.61, + "learning_rate": 2.188409880717573e-06, + "loss": 0.6753, + "step": 11364, + "task_loss": 0.8704508543014526 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5644605755805969, + "epoch": 9.61, + "learning_rate": 2.183713722175261e-06, + "loss": 0.4579, + "step": 11365, + "task_loss": 0.7010282278060913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8767354488372803, + "epoch": 9.61, + "learning_rate": 2.1790175636329484e-06, + "loss": 0.6482, + "step": 11366, + "task_loss": 1.1051936149597168 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41551673412323, + "epoch": 9.61, + "learning_rate": 2.174321405090636e-06, + "loss": 0.3793, + "step": 11367, + "task_loss": 0.4250624179840088 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3770271837711334, + "epoch": 9.61, + "learning_rate": 2.1696252465483236e-06, + "loss": 0.346, + "step": 11368, + "task_loss": 0.6807384490966797 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2927069067955017, + "epoch": 9.61, + "learning_rate": 2.1649290880060113e-06, + "loss": 0.4418, + "step": 11369, + "task_loss": 0.10040730983018875 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4206555187702179, + "epoch": 9.61, + "learning_rate": 2.160232929463699e-06, + "loss": 0.5424, + "step": 11370, + "task_loss": 0.02807283028960228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5097132921218872, + "epoch": 9.61, + "learning_rate": 2.1555367709213865e-06, + "loss": 0.4996, + "step": 11371, + "task_loss": 0.14528372883796692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.212254285812378, + "epoch": 9.61, + "learning_rate": 2.150840612379074e-06, + "loss": 0.77, + "step": 11372, + "task_loss": 1.0149073600769043 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3413701057434082, + "epoch": 9.61, + "learning_rate": 2.1461444538367617e-06, + "loss": 0.4637, + "step": 11373, + "task_loss": 0.5785795450210571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2077215015888214, + "epoch": 9.61, + "learning_rate": 2.1414482952944493e-06, + "loss": 0.5194, + "step": 11374, + "task_loss": 0.4637937545776367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5268356800079346, + "epoch": 9.61, + "learning_rate": 2.136752136752137e-06, + "loss": 0.5422, + "step": 11375, + "task_loss": 0.41784340143203735 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4584645926952362, + "epoch": 9.62, + "learning_rate": 2.1320559782098245e-06, + "loss": 0.5946, + "step": 11376, + "task_loss": 0.7697169780731201 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42890146374702454, + "epoch": 9.62, + "learning_rate": 2.127359819667512e-06, + "loss": 0.4079, + "step": 11377, + "task_loss": 0.5323693156242371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34654513001441956, + "epoch": 9.62, + "learning_rate": 2.1226636611251998e-06, + "loss": 0.4402, + "step": 11378, + "task_loss": 0.5559029579162598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7698297500610352, + "epoch": 9.62, + "learning_rate": 2.1179675025828874e-06, + "loss": 0.5272, + "step": 11379, + "task_loss": 0.4025132954120636 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8065383434295654, + "epoch": 9.62, + "learning_rate": 2.113271344040575e-06, + "loss": 0.6317, + "step": 11380, + "task_loss": 0.722902774810791 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6418182849884033, + "epoch": 9.62, + "learning_rate": 2.1085751854982626e-06, + "loss": 0.5592, + "step": 11381, + "task_loss": 0.7413223385810852 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6465228796005249, + "epoch": 9.62, + "learning_rate": 2.10387902695595e-06, + "loss": 0.5976, + "step": 11382, + "task_loss": 0.6990059018135071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5830527544021606, + "epoch": 9.62, + "learning_rate": 2.099182868413638e-06, + "loss": 0.6527, + "step": 11383, + "task_loss": 1.2063448429107666 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35674935579299927, + "epoch": 9.62, + "learning_rate": 2.0944867098713254e-06, + "loss": 0.546, + "step": 11384, + "task_loss": 0.754609227180481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6285964250564575, + "epoch": 9.62, + "learning_rate": 2.0897905513290126e-06, + "loss": 0.5898, + "step": 11385, + "task_loss": 0.421176016330719 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8485491275787354, + "epoch": 9.62, + "learning_rate": 2.0850943927867006e-06, + "loss": 0.5845, + "step": 11386, + "task_loss": 0.7623026371002197 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36015641689300537, + "epoch": 9.63, + "learning_rate": 2.0803982342443882e-06, + "loss": 0.5057, + "step": 11387, + "task_loss": 0.5767558217048645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5697177648544312, + "epoch": 9.63, + "learning_rate": 2.075702075702076e-06, + "loss": 0.6388, + "step": 11388, + "task_loss": 1.2033848762512207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5609574317932129, + "epoch": 9.63, + "learning_rate": 2.0710059171597635e-06, + "loss": 0.5668, + "step": 11389, + "task_loss": 1.1689881086349487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3769562542438507, + "epoch": 9.63, + "learning_rate": 2.066309758617451e-06, + "loss": 0.3853, + "step": 11390, + "task_loss": 0.627342700958252 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9064114093780518, + "epoch": 9.63, + "learning_rate": 2.0616136000751387e-06, + "loss": 0.8, + "step": 11391, + "task_loss": 1.698434829711914 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4534938335418701, + "epoch": 9.63, + "learning_rate": 2.0569174415328263e-06, + "loss": 0.5359, + "step": 11392, + "task_loss": 0.3029283881187439 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27526289224624634, + "epoch": 9.63, + "learning_rate": 2.052221282990514e-06, + "loss": 0.5997, + "step": 11393, + "task_loss": 0.6870337724685669 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6988538503646851, + "epoch": 9.63, + "learning_rate": 2.0475251244482015e-06, + "loss": 0.4933, + "step": 11394, + "task_loss": 1.2264176607131958 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5033267736434937, + "epoch": 9.63, + "learning_rate": 2.042828965905889e-06, + "loss": 0.5801, + "step": 11395, + "task_loss": 0.28577178716659546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2712342441082001, + "epoch": 9.63, + "learning_rate": 2.0381328073635767e-06, + "loss": 0.4701, + "step": 11396, + "task_loss": 0.041305821388959885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6948150992393494, + "epoch": 9.63, + "learning_rate": 2.0334366488212643e-06, + "loss": 0.5206, + "step": 11397, + "task_loss": 0.2705252170562744 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5131797790527344, + "epoch": 9.63, + "learning_rate": 2.028740490278952e-06, + "loss": 0.5664, + "step": 11398, + "task_loss": 0.9425686597824097 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7910622358322144, + "epoch": 9.64, + "learning_rate": 2.0240443317366396e-06, + "loss": 0.623, + "step": 11399, + "task_loss": 0.373815655708313 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2773650884628296, + "epoch": 9.64, + "learning_rate": 2.019348173194327e-06, + "loss": 0.4221, + "step": 11400, + "task_loss": 0.4856134355068207 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34700414538383484, + "epoch": 9.64, + "learning_rate": 2.0146520146520148e-06, + "loss": 0.535, + "step": 11401, + "task_loss": 0.5109906196594238 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36448150873184204, + "epoch": 9.64, + "learning_rate": 2.0099558561097024e-06, + "loss": 0.4651, + "step": 11402, + "task_loss": 0.30071714520454407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48587363958358765, + "epoch": 9.64, + "learning_rate": 2.00525969756739e-06, + "loss": 0.4584, + "step": 11403, + "task_loss": 0.7365081310272217 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.317171573638916, + "epoch": 9.64, + "learning_rate": 2.0005635390250776e-06, + "loss": 0.5336, + "step": 11404, + "task_loss": 0.5099436044692993 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4770129323005676, + "epoch": 9.64, + "learning_rate": 1.9958673804827652e-06, + "loss": 0.3974, + "step": 11405, + "task_loss": 0.9173585176467896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.387908935546875, + "epoch": 9.64, + "learning_rate": 1.991171221940453e-06, + "loss": 0.5447, + "step": 11406, + "task_loss": 0.6485097408294678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2975608706474304, + "epoch": 9.64, + "learning_rate": 1.9864750633981404e-06, + "loss": 0.6818, + "step": 11407, + "task_loss": 0.5057646632194519 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8049953579902649, + "epoch": 9.64, + "learning_rate": 1.981778904855828e-06, + "loss": 0.6527, + "step": 11408, + "task_loss": 0.6023550033569336 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4158722758293152, + "epoch": 9.64, + "learning_rate": 1.9770827463135157e-06, + "loss": 0.4384, + "step": 11409, + "task_loss": 0.8810976147651672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6338034272193909, + "epoch": 9.64, + "learning_rate": 1.9723865877712033e-06, + "loss": 0.5533, + "step": 11410, + "task_loss": 0.422211229801178 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.20183518528938293, + "epoch": 9.65, + "learning_rate": 1.967690429228891e-06, + "loss": 0.5289, + "step": 11411, + "task_loss": 0.18101917207241058 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3412478566169739, + "epoch": 9.65, + "learning_rate": 1.9629942706865785e-06, + "loss": 0.3614, + "step": 11412, + "task_loss": 1.0988258123397827 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6582709550857544, + "epoch": 9.65, + "learning_rate": 1.958298112144266e-06, + "loss": 0.532, + "step": 11413, + "task_loss": 0.9655368328094482 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3807488679885864, + "epoch": 9.65, + "learning_rate": 1.9536019536019537e-06, + "loss": 0.351, + "step": 11414, + "task_loss": 0.0759316012263298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36646178364753723, + "epoch": 9.65, + "learning_rate": 1.9489057950596413e-06, + "loss": 0.5103, + "step": 11415, + "task_loss": 0.6569719910621643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46015650033950806, + "epoch": 9.65, + "learning_rate": 1.944209636517329e-06, + "loss": 0.4993, + "step": 11416, + "task_loss": 0.3372751772403717 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46350613236427307, + "epoch": 9.65, + "learning_rate": 1.9395134779750165e-06, + "loss": 0.4979, + "step": 11417, + "task_loss": 0.3964754343032837 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7152127623558044, + "epoch": 9.65, + "learning_rate": 1.934817319432704e-06, + "loss": 0.4895, + "step": 11418, + "task_loss": 0.2720735967159271 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.432297021150589, + "epoch": 9.65, + "learning_rate": 1.9301211608903918e-06, + "loss": 0.5948, + "step": 11419, + "task_loss": 0.48711010813713074 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7634737491607666, + "epoch": 9.65, + "learning_rate": 1.9254250023480794e-06, + "loss": 0.7019, + "step": 11420, + "task_loss": 0.8792864680290222 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6778569221496582, + "epoch": 9.65, + "learning_rate": 1.920728843805767e-06, + "loss": 0.4817, + "step": 11421, + "task_loss": 0.6431885957717896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3309305012226105, + "epoch": 9.65, + "learning_rate": 1.9160326852634546e-06, + "loss": 0.4711, + "step": 11422, + "task_loss": 0.019695384427905083 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3831704258918762, + "epoch": 9.66, + "learning_rate": 1.911336526721142e-06, + "loss": 0.5397, + "step": 11423, + "task_loss": 0.31559550762176514 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4908909499645233, + "epoch": 9.66, + "learning_rate": 1.90664036817883e-06, + "loss": 0.5216, + "step": 11424, + "task_loss": 0.5803307890892029 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6039223074913025, + "epoch": 9.66, + "learning_rate": 1.9019442096365174e-06, + "loss": 0.5381, + "step": 11425, + "task_loss": 0.9097698330879211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3814285099506378, + "epoch": 9.66, + "learning_rate": 1.897248051094205e-06, + "loss": 0.5373, + "step": 11426, + "task_loss": 0.3421352505683899 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3630097508430481, + "epoch": 9.66, + "learning_rate": 1.8925518925518924e-06, + "loss": 0.4927, + "step": 11427, + "task_loss": 0.5205262303352356 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3492605686187744, + "epoch": 9.66, + "learning_rate": 1.8878557340095805e-06, + "loss": 0.6895, + "step": 11428, + "task_loss": 0.4407202899456024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39480993151664734, + "epoch": 9.66, + "learning_rate": 1.8831595754672679e-06, + "loss": 0.5088, + "step": 11429, + "task_loss": 0.9788179993629456 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31586894392967224, + "epoch": 9.66, + "learning_rate": 1.8784634169249555e-06, + "loss": 0.4813, + "step": 11430, + "task_loss": 0.6838805079460144 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8324129581451416, + "epoch": 9.66, + "learning_rate": 1.8737672583826429e-06, + "loss": 0.5918, + "step": 11431, + "task_loss": 0.7188927531242371 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4706302285194397, + "epoch": 9.66, + "learning_rate": 1.869071099840331e-06, + "loss": 0.4293, + "step": 11432, + "task_loss": 0.5183086395263672 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5760665535926819, + "epoch": 9.66, + "learning_rate": 1.8643749412980183e-06, + "loss": 0.5366, + "step": 11433, + "task_loss": 1.4357893466949463 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6407067179679871, + "epoch": 9.66, + "learning_rate": 1.859678782755706e-06, + "loss": 0.5939, + "step": 11434, + "task_loss": 0.7987834215164185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37229210138320923, + "epoch": 9.67, + "learning_rate": 1.8549826242133933e-06, + "loss": 0.5008, + "step": 11435, + "task_loss": 0.283566415309906 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3849557340145111, + "epoch": 9.67, + "learning_rate": 1.8502864656710811e-06, + "loss": 0.5514, + "step": 11436, + "task_loss": 0.3423067033290863 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6533241271972656, + "epoch": 9.67, + "learning_rate": 1.8455903071287688e-06, + "loss": 0.488, + "step": 11437, + "task_loss": 1.2742427587509155 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25612151622772217, + "epoch": 9.67, + "learning_rate": 1.8408941485864564e-06, + "loss": 0.3962, + "step": 11438, + "task_loss": 0.12319464981555939 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7691036462783813, + "epoch": 9.67, + "learning_rate": 1.8361979900441438e-06, + "loss": 0.6162, + "step": 11439, + "task_loss": 1.5722628831863403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6252167224884033, + "epoch": 9.67, + "learning_rate": 1.8315018315018316e-06, + "loss": 0.54, + "step": 11440, + "task_loss": 0.35404253005981445 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5172237753868103, + "epoch": 9.67, + "learning_rate": 1.8268056729595192e-06, + "loss": 0.607, + "step": 11441, + "task_loss": 0.9308105707168579 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27958208322525024, + "epoch": 9.67, + "learning_rate": 1.8221095144172068e-06, + "loss": 0.5482, + "step": 11442, + "task_loss": 0.5811054110527039 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5526999831199646, + "epoch": 9.67, + "learning_rate": 1.8174133558748946e-06, + "loss": 0.5631, + "step": 11443, + "task_loss": 0.7441183924674988 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5582119226455688, + "epoch": 9.67, + "learning_rate": 1.812717197332582e-06, + "loss": 0.6481, + "step": 11444, + "task_loss": 1.1997815370559692 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7663733959197998, + "epoch": 9.67, + "learning_rate": 1.8080210387902696e-06, + "loss": 0.5549, + "step": 11445, + "task_loss": 1.0760046243667603 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48026636242866516, + "epoch": 9.67, + "learning_rate": 1.8033248802479572e-06, + "loss": 0.5647, + "step": 11446, + "task_loss": 0.5653250813484192 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5247679352760315, + "epoch": 9.68, + "learning_rate": 1.798628721705645e-06, + "loss": 0.5945, + "step": 11447, + "task_loss": 1.5842952728271484 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5619928240776062, + "epoch": 9.68, + "learning_rate": 1.7939325631633325e-06, + "loss": 0.5334, + "step": 11448, + "task_loss": 0.11431525647640228 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38217905163764954, + "epoch": 9.68, + "learning_rate": 1.78923640462102e-06, + "loss": 0.4343, + "step": 11449, + "task_loss": 0.17117930948734283 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2910740375518799, + "epoch": 9.68, + "learning_rate": 1.7845402460787075e-06, + "loss": 0.6241, + "step": 11450, + "task_loss": 1.0246496200561523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3700796663761139, + "epoch": 9.68, + "learning_rate": 1.7798440875363955e-06, + "loss": 0.423, + "step": 11451, + "task_loss": 0.9428666830062866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.28177887201309204, + "epoch": 9.68, + "learning_rate": 1.775147928994083e-06, + "loss": 0.5776, + "step": 11452, + "task_loss": 0.48674842715263367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40159133076667786, + "epoch": 9.68, + "learning_rate": 1.7704517704517705e-06, + "loss": 0.5967, + "step": 11453, + "task_loss": 1.018568754196167 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5420736074447632, + "epoch": 9.68, + "learning_rate": 1.765755611909458e-06, + "loss": 0.4989, + "step": 11454, + "task_loss": 0.30141696333885193 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8138407468795776, + "epoch": 9.68, + "learning_rate": 1.761059453367146e-06, + "loss": 0.5488, + "step": 11455, + "task_loss": 0.5972869992256165 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6012736558914185, + "epoch": 9.68, + "learning_rate": 1.7563632948248333e-06, + "loss": 0.7249, + "step": 11456, + "task_loss": 0.6163088083267212 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6256277561187744, + "epoch": 9.68, + "learning_rate": 1.751667136282521e-06, + "loss": 0.5429, + "step": 11457, + "task_loss": 0.18946219980716705 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2902298867702484, + "epoch": 9.69, + "learning_rate": 1.7469709777402084e-06, + "loss": 0.4499, + "step": 11458, + "task_loss": 0.518046498298645 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6199654340744019, + "epoch": 9.69, + "learning_rate": 1.7422748191978964e-06, + "loss": 0.4597, + "step": 11459, + "task_loss": 0.2770633101463318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4486556649208069, + "epoch": 9.69, + "learning_rate": 1.7375786606555838e-06, + "loss": 0.5637, + "step": 11460, + "task_loss": 1.0077593326568604 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26702263951301575, + "epoch": 9.69, + "learning_rate": 1.7328825021132714e-06, + "loss": 0.4551, + "step": 11461, + "task_loss": 1.410670280456543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.356871098279953, + "epoch": 9.69, + "learning_rate": 1.7281863435709588e-06, + "loss": 0.5788, + "step": 11462, + "task_loss": 0.6124656200408936 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7186338901519775, + "epoch": 9.69, + "learning_rate": 1.7234901850286468e-06, + "loss": 0.6833, + "step": 11463, + "task_loss": 0.5351365208625793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5803250670433044, + "epoch": 9.69, + "learning_rate": 1.7187940264863342e-06, + "loss": 0.6266, + "step": 11464, + "task_loss": 0.41014429926872253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6916131973266602, + "epoch": 9.69, + "learning_rate": 1.7140978679440218e-06, + "loss": 0.6371, + "step": 11465, + "task_loss": 0.7057671546936035 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4370342791080475, + "epoch": 9.69, + "learning_rate": 1.7094017094017097e-06, + "loss": 0.5309, + "step": 11466, + "task_loss": 0.4182102382183075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5826621651649475, + "epoch": 9.69, + "learning_rate": 1.7047055508593973e-06, + "loss": 0.588, + "step": 11467, + "task_loss": 0.2382158488035202 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8353487253189087, + "epoch": 9.69, + "learning_rate": 1.7000093923170847e-06, + "loss": 0.5951, + "step": 11468, + "task_loss": 0.9908468127250671 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6293667554855347, + "epoch": 9.69, + "learning_rate": 1.6953132337747723e-06, + "loss": 0.6013, + "step": 11469, + "task_loss": 0.7203791737556458 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.1999642848968506, + "epoch": 9.7, + "learning_rate": 1.69061707523246e-06, + "loss": 0.6438, + "step": 11470, + "task_loss": 0.9174990653991699 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47182023525238037, + "epoch": 9.7, + "learning_rate": 1.6859209166901477e-06, + "loss": 0.5863, + "step": 11471, + "task_loss": 0.2513229548931122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4197593927383423, + "epoch": 9.7, + "learning_rate": 1.6812247581478351e-06, + "loss": 0.4886, + "step": 11472, + "task_loss": 0.3842479884624481 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35015416145324707, + "epoch": 9.7, + "learning_rate": 1.6765285996055227e-06, + "loss": 0.5745, + "step": 11473, + "task_loss": 0.5927641987800598 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5260219573974609, + "epoch": 9.7, + "learning_rate": 1.6718324410632105e-06, + "loss": 0.4678, + "step": 11474, + "task_loss": 0.2036322057247162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7272312641143799, + "epoch": 9.7, + "learning_rate": 1.667136282520898e-06, + "loss": 0.6014, + "step": 11475, + "task_loss": 0.299737811088562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46678540110588074, + "epoch": 9.7, + "learning_rate": 1.6624401239785856e-06, + "loss": 0.4281, + "step": 11476, + "task_loss": 0.5127730965614319 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33670303225517273, + "epoch": 9.7, + "learning_rate": 1.6577439654362732e-06, + "loss": 0.5122, + "step": 11477, + "task_loss": 0.3591921627521515 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4550689458847046, + "epoch": 9.7, + "learning_rate": 1.653047806893961e-06, + "loss": 0.4194, + "step": 11478, + "task_loss": 0.33631694316864014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7339975833892822, + "epoch": 9.7, + "learning_rate": 1.6483516483516484e-06, + "loss": 0.5662, + "step": 11479, + "task_loss": 1.0520552396774292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6990529298782349, + "epoch": 9.7, + "learning_rate": 1.643655489809336e-06, + "loss": 0.6022, + "step": 11480, + "task_loss": 1.287522792816162 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36819988489151, + "epoch": 9.7, + "learning_rate": 1.6389593312670236e-06, + "loss": 0.4681, + "step": 11481, + "task_loss": 0.6823718547821045 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7517868280410767, + "epoch": 9.71, + "learning_rate": 1.6342631727247114e-06, + "loss": 0.6539, + "step": 11482, + "task_loss": 0.8537051677703857 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49158775806427, + "epoch": 9.71, + "learning_rate": 1.6295670141823988e-06, + "loss": 0.6412, + "step": 11483, + "task_loss": 1.2917907238006592 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7884390354156494, + "epoch": 9.71, + "learning_rate": 1.6248708556400864e-06, + "loss": 0.7169, + "step": 11484, + "task_loss": 1.4464924335479736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6628308892250061, + "epoch": 9.71, + "learning_rate": 1.620174697097774e-06, + "loss": 0.5296, + "step": 11485, + "task_loss": 0.9179601669311523 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37098443508148193, + "epoch": 9.71, + "learning_rate": 1.6154785385554619e-06, + "loss": 0.4818, + "step": 11486, + "task_loss": 2.040372133255005 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4042167067527771, + "epoch": 9.71, + "learning_rate": 1.6107823800131493e-06, + "loss": 0.6047, + "step": 11487, + "task_loss": 0.5229744911193848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8282198905944824, + "epoch": 9.71, + "learning_rate": 1.6060862214708369e-06, + "loss": 0.6515, + "step": 11488, + "task_loss": 1.1605839729309082 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5591453909873962, + "epoch": 9.71, + "learning_rate": 1.6013900629285243e-06, + "loss": 0.523, + "step": 11489, + "task_loss": 0.4052616059780121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5347902178764343, + "epoch": 9.71, + "learning_rate": 1.5966939043862123e-06, + "loss": 0.6062, + "step": 11490, + "task_loss": 0.501980185508728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5177268981933594, + "epoch": 9.71, + "learning_rate": 1.5919977458438997e-06, + "loss": 0.5345, + "step": 11491, + "task_loss": 0.1114128977060318 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44069892168045044, + "epoch": 9.71, + "learning_rate": 1.5873015873015873e-06, + "loss": 0.5572, + "step": 11492, + "task_loss": 0.3993459641933441 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5387635231018066, + "epoch": 9.71, + "learning_rate": 1.5826054287592751e-06, + "loss": 0.6122, + "step": 11493, + "task_loss": 0.5208805203437805 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6882684230804443, + "epoch": 9.72, + "learning_rate": 1.5779092702169627e-06, + "loss": 0.6071, + "step": 11494, + "task_loss": 1.050876259803772 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25800374150276184, + "epoch": 9.72, + "learning_rate": 1.5732131116746501e-06, + "loss": 0.5395, + "step": 11495, + "task_loss": 0.8320268392562866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6293847560882568, + "epoch": 9.72, + "learning_rate": 1.5685169531323378e-06, + "loss": 0.6958, + "step": 11496, + "task_loss": 0.7482283115386963 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5255011916160583, + "epoch": 9.72, + "learning_rate": 1.5638207945900256e-06, + "loss": 0.5407, + "step": 11497, + "task_loss": 0.9688664674758911 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41892117261886597, + "epoch": 9.72, + "learning_rate": 1.5591246360477132e-06, + "loss": 0.7519, + "step": 11498, + "task_loss": 0.41321319341659546 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4400743842124939, + "epoch": 9.72, + "learning_rate": 1.5544284775054006e-06, + "loss": 0.415, + "step": 11499, + "task_loss": 0.21726928651332855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.390968918800354, + "epoch": 9.72, + "learning_rate": 1.5497323189630884e-06, + "loss": 0.4337, + "step": 11500, + "task_loss": 0.8967593312263489 + }, + { + "epoch": 9.72, + "eval_accuracy": 0.9057425742574258, + "eval_loss": 0.3511705696582794, + "eval_runtime": 227.2672, + "eval_samples_per_second": 111.103, + "eval_steps_per_second": 0.871, + "step": 11500 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5567768812179565, + "epoch": 9.72, + "learning_rate": 1.5450361604207758e-06, + "loss": 0.5434, + "step": 11501, + "task_loss": 0.2658913731575012 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46981245279312134, + "epoch": 9.72, + "learning_rate": 1.5403400018784636e-06, + "loss": 0.4256, + "step": 11502, + "task_loss": 0.14546000957489014 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3608664572238922, + "epoch": 9.72, + "learning_rate": 1.535643843336151e-06, + "loss": 0.4488, + "step": 11503, + "task_loss": 0.5932294130325317 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.638058602809906, + "epoch": 9.72, + "learning_rate": 1.5309476847938389e-06, + "loss": 0.6472, + "step": 11504, + "task_loss": 0.14560745656490326 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40062224864959717, + "epoch": 9.72, + "learning_rate": 1.5262515262515263e-06, + "loss": 0.4603, + "step": 11505, + "task_loss": 0.8395342826843262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2785366475582123, + "epoch": 9.73, + "learning_rate": 1.521555367709214e-06, + "loss": 0.3652, + "step": 11506, + "task_loss": 0.09998361021280289 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5614135265350342, + "epoch": 9.73, + "learning_rate": 1.5168592091669015e-06, + "loss": 0.4011, + "step": 11507, + "task_loss": 0.3780283033847809 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6255232691764832, + "epoch": 9.73, + "learning_rate": 1.5121630506245893e-06, + "loss": 0.56, + "step": 11508, + "task_loss": 1.5151031017303467 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43301820755004883, + "epoch": 9.73, + "learning_rate": 1.5074668920822767e-06, + "loss": 0.4538, + "step": 11509, + "task_loss": 0.14908526837825775 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4144566059112549, + "epoch": 9.73, + "learning_rate": 1.5027707335399643e-06, + "loss": 0.6267, + "step": 11510, + "task_loss": 0.521141767501831 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5095493793487549, + "epoch": 9.73, + "learning_rate": 1.498074574997652e-06, + "loss": 0.6482, + "step": 11511, + "task_loss": 0.8237716555595398 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6125508546829224, + "epoch": 9.73, + "learning_rate": 1.4933784164553395e-06, + "loss": 0.5238, + "step": 11512, + "task_loss": 0.26352524757385254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.437438040971756, + "epoch": 9.73, + "learning_rate": 1.4886822579130271e-06, + "loss": 0.6446, + "step": 11513, + "task_loss": 0.255998820066452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40522947907447815, + "epoch": 9.73, + "learning_rate": 1.4839860993707147e-06, + "loss": 0.5732, + "step": 11514, + "task_loss": 0.19258660078048706 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41361743211746216, + "epoch": 9.73, + "learning_rate": 1.4792899408284024e-06, + "loss": 0.4055, + "step": 11515, + "task_loss": 0.2814204692840576 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5382248759269714, + "epoch": 9.73, + "learning_rate": 1.47459378228609e-06, + "loss": 0.4374, + "step": 11516, + "task_loss": 0.300976037979126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.569606363773346, + "epoch": 9.73, + "learning_rate": 1.4698976237437776e-06, + "loss": 0.5846, + "step": 11517, + "task_loss": 1.4356436729431152 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.26649874448776245, + "epoch": 9.74, + "learning_rate": 1.4652014652014652e-06, + "loss": 0.4426, + "step": 11518, + "task_loss": 0.7886174321174622 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4209505319595337, + "epoch": 9.74, + "learning_rate": 1.460505306659153e-06, + "loss": 0.6332, + "step": 11519, + "task_loss": 0.3427625596523285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37186598777770996, + "epoch": 9.74, + "learning_rate": 1.4558091481168404e-06, + "loss": 0.4816, + "step": 11520, + "task_loss": 0.19172817468643188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35611990094184875, + "epoch": 9.74, + "learning_rate": 1.4511129895745282e-06, + "loss": 0.5433, + "step": 11521, + "task_loss": 0.2556125819683075 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8351103067398071, + "epoch": 9.74, + "learning_rate": 1.4464168310322156e-06, + "loss": 0.6653, + "step": 11522, + "task_loss": 0.6433969736099243 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4323684871196747, + "epoch": 9.74, + "learning_rate": 1.4417206724899034e-06, + "loss": 0.5032, + "step": 11523, + "task_loss": 0.5185337662696838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4899367392063141, + "epoch": 9.74, + "learning_rate": 1.4370245139475908e-06, + "loss": 0.5382, + "step": 11524, + "task_loss": 0.4096483886241913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4018746614456177, + "epoch": 9.74, + "learning_rate": 1.4323283554052787e-06, + "loss": 0.4922, + "step": 11525, + "task_loss": 0.870339035987854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5089036226272583, + "epoch": 9.74, + "learning_rate": 1.427632196862966e-06, + "loss": 0.5942, + "step": 11526, + "task_loss": 1.0038331747055054 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5607277154922485, + "epoch": 9.74, + "learning_rate": 1.4229360383206539e-06, + "loss": 0.5483, + "step": 11527, + "task_loss": 0.5007983446121216 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35664600133895874, + "epoch": 9.74, + "learning_rate": 1.4182398797783413e-06, + "loss": 0.5427, + "step": 11528, + "task_loss": 1.0948646068572998 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6890609860420227, + "epoch": 9.75, + "learning_rate": 1.4135437212360291e-06, + "loss": 0.5119, + "step": 11529, + "task_loss": 0.6640431880950928 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6069000363349915, + "epoch": 9.75, + "learning_rate": 1.4088475626937165e-06, + "loss": 0.4472, + "step": 11530, + "task_loss": 0.28757908940315247 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5472804307937622, + "epoch": 9.75, + "learning_rate": 1.4041514041514043e-06, + "loss": 0.5132, + "step": 11531, + "task_loss": 0.5858628153800964 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3309595584869385, + "epoch": 9.75, + "learning_rate": 1.3994552456090917e-06, + "loss": 0.4889, + "step": 11532, + "task_loss": 0.3597802221775055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.0271484851837158, + "epoch": 9.75, + "learning_rate": 1.3947590870667795e-06, + "loss": 0.6455, + "step": 11533, + "task_loss": 1.0215789079666138 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.443034827709198, + "epoch": 9.75, + "learning_rate": 1.390062928524467e-06, + "loss": 0.4848, + "step": 11534, + "task_loss": 0.2977057695388794 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7900955080986023, + "epoch": 9.75, + "learning_rate": 1.3853667699821548e-06, + "loss": 0.481, + "step": 11535, + "task_loss": 1.3864778280258179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.392417848110199, + "epoch": 9.75, + "learning_rate": 1.3806706114398422e-06, + "loss": 0.4273, + "step": 11536, + "task_loss": 0.5588244199752808 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6663950681686401, + "epoch": 9.75, + "learning_rate": 1.37597445289753e-06, + "loss": 0.5253, + "step": 11537, + "task_loss": 1.1801608800888062 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 1.243700623512268, + "epoch": 9.75, + "learning_rate": 1.3712782943552174e-06, + "loss": 0.742, + "step": 11538, + "task_loss": 1.4689083099365234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4488237798213959, + "epoch": 9.75, + "learning_rate": 1.3665821358129052e-06, + "loss": 0.5058, + "step": 11539, + "task_loss": 0.1712009608745575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45357435941696167, + "epoch": 9.75, + "learning_rate": 1.3618859772705926e-06, + "loss": 0.569, + "step": 11540, + "task_loss": 0.6335695385932922 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4018586277961731, + "epoch": 9.76, + "learning_rate": 1.3571898187282804e-06, + "loss": 0.5226, + "step": 11541, + "task_loss": 0.15843738615512848 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5824781060218811, + "epoch": 9.76, + "learning_rate": 1.3524936601859678e-06, + "loss": 0.7349, + "step": 11542, + "task_loss": 1.4136425256729126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5276750922203064, + "epoch": 9.76, + "learning_rate": 1.3477975016436557e-06, + "loss": 0.4919, + "step": 11543, + "task_loss": 0.6889218688011169 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5535430312156677, + "epoch": 9.76, + "learning_rate": 1.3431013431013433e-06, + "loss": 0.4041, + "step": 11544, + "task_loss": 1.014448881149292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.483516126871109, + "epoch": 9.76, + "learning_rate": 1.3384051845590309e-06, + "loss": 0.5345, + "step": 11545, + "task_loss": 0.6694908142089844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5338544249534607, + "epoch": 9.76, + "learning_rate": 1.3337090260167185e-06, + "loss": 0.5485, + "step": 11546, + "task_loss": 1.0113564729690552 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3086256980895996, + "epoch": 9.76, + "learning_rate": 1.329012867474406e-06, + "loss": 0.5753, + "step": 11547, + "task_loss": 0.6875080466270447 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37699294090270996, + "epoch": 9.76, + "learning_rate": 1.3243167089320937e-06, + "loss": 0.4969, + "step": 11548, + "task_loss": 0.1622782051563263 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.561211347579956, + "epoch": 9.76, + "learning_rate": 1.319620550389781e-06, + "loss": 0.5388, + "step": 11549, + "task_loss": 1.0040934085845947 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4216953217983246, + "epoch": 9.76, + "learning_rate": 1.314924391847469e-06, + "loss": 0.4537, + "step": 11550, + "task_loss": 0.8901730179786682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6656798720359802, + "epoch": 9.76, + "learning_rate": 1.3102282333051563e-06, + "loss": 0.5279, + "step": 11551, + "task_loss": 0.6245513558387756 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5316826105117798, + "epoch": 9.76, + "learning_rate": 1.3055320747628441e-06, + "loss": 0.6651, + "step": 11552, + "task_loss": 1.250679850578308 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3855898082256317, + "epoch": 9.77, + "learning_rate": 1.3008359162205315e-06, + "loss": 0.5085, + "step": 11553, + "task_loss": 1.1959214210510254 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3168773055076599, + "epoch": 9.77, + "learning_rate": 1.2961397576782194e-06, + "loss": 0.3888, + "step": 11554, + "task_loss": 0.569686233997345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.651950478553772, + "epoch": 9.77, + "learning_rate": 1.2914435991359068e-06, + "loss": 0.5338, + "step": 11555, + "task_loss": 0.7176133990287781 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5993804931640625, + "epoch": 9.77, + "learning_rate": 1.2867474405935946e-06, + "loss": 0.4859, + "step": 11556, + "task_loss": 0.31746017932891846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31995171308517456, + "epoch": 9.77, + "learning_rate": 1.282051282051282e-06, + "loss": 0.477, + "step": 11557, + "task_loss": 0.662502110004425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8163994550704956, + "epoch": 9.77, + "learning_rate": 1.2773551235089698e-06, + "loss": 0.6399, + "step": 11558, + "task_loss": 0.5642949938774109 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6283993721008301, + "epoch": 9.77, + "learning_rate": 1.2726589649666572e-06, + "loss": 0.6732, + "step": 11559, + "task_loss": 1.2210075855255127 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4720362424850464, + "epoch": 9.77, + "learning_rate": 1.267962806424345e-06, + "loss": 0.5127, + "step": 11560, + "task_loss": 0.5592157244682312 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6326269507408142, + "epoch": 9.77, + "learning_rate": 1.2632666478820324e-06, + "loss": 0.4377, + "step": 11561, + "task_loss": 0.5415890216827393 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7122247219085693, + "epoch": 9.77, + "learning_rate": 1.2585704893397202e-06, + "loss": 0.825, + "step": 11562, + "task_loss": 0.4452231824398041 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38139837980270386, + "epoch": 9.77, + "learning_rate": 1.2538743307974076e-06, + "loss": 0.5184, + "step": 11563, + "task_loss": 1.0324598550796509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5226012468338013, + "epoch": 9.77, + "learning_rate": 1.2491781722550955e-06, + "loss": 0.5152, + "step": 11564, + "task_loss": 0.3369823694229126 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6828691959381104, + "epoch": 9.78, + "learning_rate": 1.2444820137127829e-06, + "loss": 0.6023, + "step": 11565, + "task_loss": 0.8556783199310303 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45525631308555603, + "epoch": 9.78, + "learning_rate": 1.2397858551704707e-06, + "loss": 0.3918, + "step": 11566, + "task_loss": 0.5870350003242493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4529707133769989, + "epoch": 9.78, + "learning_rate": 1.235089696628158e-06, + "loss": 0.5443, + "step": 11567, + "task_loss": 0.3626823127269745 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39108002185821533, + "epoch": 9.78, + "learning_rate": 1.230393538085846e-06, + "loss": 0.5113, + "step": 11568, + "task_loss": 1.2324045896530151 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6650864481925964, + "epoch": 9.78, + "learning_rate": 1.2256973795435333e-06, + "loss": 0.564, + "step": 11569, + "task_loss": 1.095869541168213 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6135939955711365, + "epoch": 9.78, + "learning_rate": 1.2210012210012211e-06, + "loss": 0.633, + "step": 11570, + "task_loss": 1.5814604759216309 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6984860301017761, + "epoch": 9.78, + "learning_rate": 1.2163050624589087e-06, + "loss": 0.4799, + "step": 11571, + "task_loss": 0.4044111669063568 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6684539318084717, + "epoch": 9.78, + "learning_rate": 1.2116089039165963e-06, + "loss": 0.5773, + "step": 11572, + "task_loss": 0.9324235320091248 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35391858220100403, + "epoch": 9.78, + "learning_rate": 1.206912745374284e-06, + "loss": 0.3707, + "step": 11573, + "task_loss": 0.3308321237564087 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4062829613685608, + "epoch": 9.78, + "learning_rate": 1.2022165868319716e-06, + "loss": 0.3745, + "step": 11574, + "task_loss": 0.7701423764228821 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6568615436553955, + "epoch": 9.78, + "learning_rate": 1.1975204282896592e-06, + "loss": 0.547, + "step": 11575, + "task_loss": 0.6077378392219543 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6105924844741821, + "epoch": 9.78, + "learning_rate": 1.1928242697473468e-06, + "loss": 0.5293, + "step": 11576, + "task_loss": 1.178383231163025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3055509328842163, + "epoch": 9.79, + "learning_rate": 1.1881281112050344e-06, + "loss": 0.4596, + "step": 11577, + "task_loss": 0.5228952169418335 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5921518802642822, + "epoch": 9.79, + "learning_rate": 1.183431952662722e-06, + "loss": 0.5094, + "step": 11578, + "task_loss": 0.2918110191822052 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40331828594207764, + "epoch": 9.79, + "learning_rate": 1.1787357941204096e-06, + "loss": 0.4365, + "step": 11579, + "task_loss": 0.49102702736854553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.717434287071228, + "epoch": 9.79, + "learning_rate": 1.1740396355780972e-06, + "loss": 0.5516, + "step": 11580, + "task_loss": 0.9940044283866882 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3069460988044739, + "epoch": 9.79, + "learning_rate": 1.1693434770357848e-06, + "loss": 0.4929, + "step": 11581, + "task_loss": 0.3099232017993927 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8899157047271729, + "epoch": 9.79, + "learning_rate": 1.1646473184934725e-06, + "loss": 0.5714, + "step": 11582, + "task_loss": 1.061488151550293 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33348381519317627, + "epoch": 9.79, + "learning_rate": 1.15995115995116e-06, + "loss": 0.6079, + "step": 11583, + "task_loss": 0.09759016335010529 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.423615038394928, + "epoch": 9.79, + "learning_rate": 1.1552550014088477e-06, + "loss": 0.4984, + "step": 11584, + "task_loss": 0.8607199788093567 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.515560507774353, + "epoch": 9.79, + "learning_rate": 1.1505588428665353e-06, + "loss": 0.5499, + "step": 11585, + "task_loss": 0.5212283134460449 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5573514699935913, + "epoch": 9.79, + "learning_rate": 1.1458626843242229e-06, + "loss": 0.5308, + "step": 11586, + "task_loss": 0.6055467128753662 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46745622158050537, + "epoch": 9.79, + "learning_rate": 1.1411665257819105e-06, + "loss": 0.5789, + "step": 11587, + "task_loss": 1.0636824369430542 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47048380970954895, + "epoch": 9.79, + "learning_rate": 1.136470367239598e-06, + "loss": 0.5039, + "step": 11588, + "task_loss": 0.31840845942497253 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6769927740097046, + "epoch": 9.8, + "learning_rate": 1.1317742086972857e-06, + "loss": 0.5405, + "step": 11589, + "task_loss": 0.6843545436859131 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9910627603530884, + "epoch": 9.8, + "learning_rate": 1.1270780501549731e-06, + "loss": 0.6836, + "step": 11590, + "task_loss": 0.8024782538414001 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5615391135215759, + "epoch": 9.8, + "learning_rate": 1.122381891612661e-06, + "loss": 0.6264, + "step": 11591, + "task_loss": 0.3703474998474121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33381354808807373, + "epoch": 9.8, + "learning_rate": 1.1176857330703483e-06, + "loss": 0.516, + "step": 11592, + "task_loss": 0.6174180507659912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27954569458961487, + "epoch": 9.8, + "learning_rate": 1.1129895745280362e-06, + "loss": 0.4798, + "step": 11593, + "task_loss": 0.4783615171909332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6030651330947876, + "epoch": 9.8, + "learning_rate": 1.1082934159857236e-06, + "loss": 0.6021, + "step": 11594, + "task_loss": 1.0057975053787231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41649362444877625, + "epoch": 9.8, + "learning_rate": 1.1035972574434114e-06, + "loss": 0.4583, + "step": 11595, + "task_loss": 0.3739801049232483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27364665269851685, + "epoch": 9.8, + "learning_rate": 1.098901098901099e-06, + "loss": 0.5547, + "step": 11596, + "task_loss": 1.8313324451446533 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3823930621147156, + "epoch": 9.8, + "learning_rate": 1.0942049403587866e-06, + "loss": 0.4341, + "step": 11597, + "task_loss": 0.6088560223579407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49057888984680176, + "epoch": 9.8, + "learning_rate": 1.0895087818164742e-06, + "loss": 0.6453, + "step": 11598, + "task_loss": 0.9247868657112122 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4640599489212036, + "epoch": 9.8, + "learning_rate": 1.0848126232741618e-06, + "loss": 0.5071, + "step": 11599, + "task_loss": 0.5801272988319397 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33312439918518066, + "epoch": 9.81, + "learning_rate": 1.0801164647318494e-06, + "loss": 0.501, + "step": 11600, + "task_loss": 0.12407442927360535 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4253785014152527, + "epoch": 9.81, + "learning_rate": 1.075420306189537e-06, + "loss": 0.5372, + "step": 11601, + "task_loss": 0.5787076950073242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.668445348739624, + "epoch": 9.81, + "learning_rate": 1.0707241476472247e-06, + "loss": 0.5008, + "step": 11602, + "task_loss": 0.1661590188741684 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9472905397415161, + "epoch": 9.81, + "learning_rate": 1.0660279891049123e-06, + "loss": 0.5975, + "step": 11603, + "task_loss": 0.24609535932540894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3680833578109741, + "epoch": 9.81, + "learning_rate": 1.0613318305625999e-06, + "loss": 0.4816, + "step": 11604, + "task_loss": 0.7102903127670288 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.29346439242362976, + "epoch": 9.81, + "learning_rate": 1.0566356720202875e-06, + "loss": 0.503, + "step": 11605, + "task_loss": 0.2607310116291046 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4590504765510559, + "epoch": 9.81, + "learning_rate": 1.051939513477975e-06, + "loss": 0.5866, + "step": 11606, + "task_loss": 0.5955806374549866 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25720179080963135, + "epoch": 9.81, + "learning_rate": 1.0472433549356627e-06, + "loss": 0.525, + "step": 11607, + "task_loss": 0.5590839385986328 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6122144460678101, + "epoch": 9.81, + "learning_rate": 1.0425471963933503e-06, + "loss": 0.6745, + "step": 11608, + "task_loss": 0.4045354425907135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25508996844291687, + "epoch": 9.81, + "learning_rate": 1.037851037851038e-06, + "loss": 0.5522, + "step": 11609, + "task_loss": 0.32204800844192505 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6577528119087219, + "epoch": 9.81, + "learning_rate": 1.0331548793087255e-06, + "loss": 0.5495, + "step": 11610, + "task_loss": 0.7620530724525452 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3551430106163025, + "epoch": 9.81, + "learning_rate": 1.0284587207664131e-06, + "loss": 0.3791, + "step": 11611, + "task_loss": 0.42685604095458984 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.25118494033813477, + "epoch": 9.82, + "learning_rate": 1.0237625622241008e-06, + "loss": 0.5701, + "step": 11612, + "task_loss": 0.15834520757198334 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49000081419944763, + "epoch": 9.82, + "learning_rate": 1.0190664036817884e-06, + "loss": 0.46, + "step": 11613, + "task_loss": 0.3670029938220978 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5317562818527222, + "epoch": 9.82, + "learning_rate": 1.014370245139476e-06, + "loss": 0.5727, + "step": 11614, + "task_loss": 1.1310384273529053 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5481013059616089, + "epoch": 9.82, + "learning_rate": 1.0096740865971636e-06, + "loss": 0.5879, + "step": 11615, + "task_loss": 0.6628435254096985 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38540834188461304, + "epoch": 9.82, + "learning_rate": 1.0049779280548512e-06, + "loss": 0.5063, + "step": 11616, + "task_loss": 0.7893949747085571 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2162359356880188, + "epoch": 9.82, + "learning_rate": 1.0002817695125388e-06, + "loss": 0.3939, + "step": 11617, + "task_loss": 0.08303603529930115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43777918815612793, + "epoch": 9.82, + "learning_rate": 9.955856109702264e-07, + "loss": 0.4387, + "step": 11618, + "task_loss": 0.6692999601364136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5562664270401001, + "epoch": 9.82, + "learning_rate": 9.90889452427914e-07, + "loss": 0.4284, + "step": 11619, + "task_loss": 0.626455545425415 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3915444016456604, + "epoch": 9.82, + "learning_rate": 9.861932938856016e-07, + "loss": 0.5015, + "step": 11620, + "task_loss": 1.145024299621582 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4873444437980652, + "epoch": 9.82, + "learning_rate": 9.814971353432892e-07, + "loss": 0.6575, + "step": 11621, + "task_loss": 0.5350620746612549 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6686731576919556, + "epoch": 9.82, + "learning_rate": 9.768009768009769e-07, + "loss": 0.6264, + "step": 11622, + "task_loss": 0.5422884225845337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.33861276507377625, + "epoch": 9.82, + "learning_rate": 9.721048182586645e-07, + "loss": 0.497, + "step": 11623, + "task_loss": 0.22767645120620728 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.701568603515625, + "epoch": 9.83, + "learning_rate": 9.67408659716352e-07, + "loss": 0.4998, + "step": 11624, + "task_loss": 1.2405941486358643 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2596818804740906, + "epoch": 9.83, + "learning_rate": 9.627125011740397e-07, + "loss": 0.5146, + "step": 11625, + "task_loss": 0.18982334434986115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5603885650634766, + "epoch": 9.83, + "learning_rate": 9.580163426317273e-07, + "loss": 0.5939, + "step": 11626, + "task_loss": 1.038058876991272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4759581685066223, + "epoch": 9.83, + "learning_rate": 9.53320184089415e-07, + "loss": 0.4709, + "step": 11627, + "task_loss": 0.8012621998786926 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.43395036458969116, + "epoch": 9.83, + "learning_rate": 9.486240255471025e-07, + "loss": 0.5026, + "step": 11628, + "task_loss": 1.1749374866485596 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38206666707992554, + "epoch": 9.83, + "learning_rate": 9.439278670047902e-07, + "loss": 0.4065, + "step": 11629, + "task_loss": 1.007368564605713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4643164873123169, + "epoch": 9.83, + "learning_rate": 9.392317084624777e-07, + "loss": 0.5459, + "step": 11630, + "task_loss": 0.6300928592681885 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5903993844985962, + "epoch": 9.83, + "learning_rate": 9.345355499201655e-07, + "loss": 0.5369, + "step": 11631, + "task_loss": 0.4379362463951111 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7203224301338196, + "epoch": 9.83, + "learning_rate": 9.29839391377853e-07, + "loss": 0.5348, + "step": 11632, + "task_loss": 0.5422298312187195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5331912040710449, + "epoch": 9.83, + "learning_rate": 9.251432328355406e-07, + "loss": 0.3797, + "step": 11633, + "task_loss": 0.36403870582580566 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.49985480308532715, + "epoch": 9.83, + "learning_rate": 9.204470742932282e-07, + "loss": 0.4733, + "step": 11634, + "task_loss": 0.2338169813156128 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4326721429824829, + "epoch": 9.83, + "learning_rate": 9.157509157509158e-07, + "loss": 0.4679, + "step": 11635, + "task_loss": 0.07940167933702469 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5828026533126831, + "epoch": 9.84, + "learning_rate": 9.110547572086034e-07, + "loss": 0.6053, + "step": 11636, + "task_loss": 0.779792845249176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47358018159866333, + "epoch": 9.84, + "learning_rate": 9.06358598666291e-07, + "loss": 0.3773, + "step": 11637, + "task_loss": 1.2171730995178223 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7970803380012512, + "epoch": 9.84, + "learning_rate": 9.016624401239786e-07, + "loss": 0.481, + "step": 11638, + "task_loss": 0.8441946506500244 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.233774796128273, + "epoch": 9.84, + "learning_rate": 8.969662815816662e-07, + "loss": 0.4071, + "step": 11639, + "task_loss": 0.18194571137428284 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.596062183380127, + "epoch": 9.84, + "learning_rate": 8.922701230393537e-07, + "loss": 0.4433, + "step": 11640, + "task_loss": 0.6692358255386353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5080369114875793, + "epoch": 9.84, + "learning_rate": 8.875739644970415e-07, + "loss": 0.5405, + "step": 11641, + "task_loss": 0.6118054986000061 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4059675335884094, + "epoch": 9.84, + "learning_rate": 8.82877805954729e-07, + "loss": 0.5871, + "step": 11642, + "task_loss": 0.22557257115840912 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32986581325531006, + "epoch": 9.84, + "learning_rate": 8.781816474124167e-07, + "loss": 0.4492, + "step": 11643, + "task_loss": 0.1751708984375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5777475237846375, + "epoch": 9.84, + "learning_rate": 8.734854888701042e-07, + "loss": 0.4186, + "step": 11644, + "task_loss": 0.741260826587677 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5399770736694336, + "epoch": 9.84, + "learning_rate": 8.687893303277919e-07, + "loss": 0.4496, + "step": 11645, + "task_loss": 1.0818626880645752 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6499859690666199, + "epoch": 9.84, + "learning_rate": 8.640931717854794e-07, + "loss": 0.7451, + "step": 11646, + "task_loss": 0.6631431579589844 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9022203683853149, + "epoch": 9.84, + "learning_rate": 8.593970132431671e-07, + "loss": 0.5803, + "step": 11647, + "task_loss": 0.7155879735946655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4263777732849121, + "epoch": 9.85, + "learning_rate": 8.547008547008548e-07, + "loss": 0.3696, + "step": 11648, + "task_loss": 0.24299080669879913 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34213414788246155, + "epoch": 9.85, + "learning_rate": 8.500046961585423e-07, + "loss": 0.5425, + "step": 11649, + "task_loss": 0.3378290832042694 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3540082573890686, + "epoch": 9.85, + "learning_rate": 8.4530853761623e-07, + "loss": 0.4214, + "step": 11650, + "task_loss": 1.3626933097839355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5542054772377014, + "epoch": 9.85, + "learning_rate": 8.406123790739176e-07, + "loss": 0.4635, + "step": 11651, + "task_loss": 0.9357801675796509 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37362322211265564, + "epoch": 9.85, + "learning_rate": 8.359162205316053e-07, + "loss": 0.3824, + "step": 11652, + "task_loss": 1.2620269060134888 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6783081293106079, + "epoch": 9.85, + "learning_rate": 8.312200619892928e-07, + "loss": 0.6272, + "step": 11653, + "task_loss": 0.5918532609939575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4210309386253357, + "epoch": 9.85, + "learning_rate": 8.265239034469805e-07, + "loss": 0.5351, + "step": 11654, + "task_loss": 1.0818651914596558 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5427195429801941, + "epoch": 9.85, + "learning_rate": 8.21827744904668e-07, + "loss": 0.4965, + "step": 11655, + "task_loss": 0.5718876123428345 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7072763442993164, + "epoch": 9.85, + "learning_rate": 8.171315863623557e-07, + "loss": 0.5062, + "step": 11656, + "task_loss": 1.2994518280029297 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.603376030921936, + "epoch": 9.85, + "learning_rate": 8.124354278200432e-07, + "loss": 0.6182, + "step": 11657, + "task_loss": 0.3070209324359894 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6007376909255981, + "epoch": 9.85, + "learning_rate": 8.077392692777309e-07, + "loss": 0.5859, + "step": 11658, + "task_loss": 0.7234011292457581 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4592774510383606, + "epoch": 9.85, + "learning_rate": 8.030431107354184e-07, + "loss": 0.5964, + "step": 11659, + "task_loss": 0.8880534768104553 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.716975748538971, + "epoch": 9.86, + "learning_rate": 7.983469521931062e-07, + "loss": 0.6425, + "step": 11660, + "task_loss": 0.7926874160766602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8078975677490234, + "epoch": 9.86, + "learning_rate": 7.936507936507937e-07, + "loss": 0.5674, + "step": 11661, + "task_loss": 1.9866645336151123 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.702674150466919, + "epoch": 9.86, + "learning_rate": 7.889546351084814e-07, + "loss": 0.4995, + "step": 11662, + "task_loss": 0.5355736613273621 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6361211538314819, + "epoch": 9.86, + "learning_rate": 7.842584765661689e-07, + "loss": 0.5624, + "step": 11663, + "task_loss": 1.071346402168274 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5005534887313843, + "epoch": 9.86, + "learning_rate": 7.795623180238566e-07, + "loss": 0.5035, + "step": 11664, + "task_loss": 0.9089404940605164 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39026057720184326, + "epoch": 9.86, + "learning_rate": 7.748661594815442e-07, + "loss": 0.5517, + "step": 11665, + "task_loss": 0.2000344693660736 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40136778354644775, + "epoch": 9.86, + "learning_rate": 7.701700009392318e-07, + "loss": 0.3921, + "step": 11666, + "task_loss": 1.0477348566055298 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9167822599411011, + "epoch": 9.86, + "learning_rate": 7.654738423969194e-07, + "loss": 0.6566, + "step": 11667, + "task_loss": 1.1802221536636353 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40343695878982544, + "epoch": 9.86, + "learning_rate": 7.60777683854607e-07, + "loss": 0.5839, + "step": 11668, + "task_loss": 0.8019670844078064 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5909879207611084, + "epoch": 9.86, + "learning_rate": 7.560815253122946e-07, + "loss": 0.4888, + "step": 11669, + "task_loss": 0.8121470212936401 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5047805309295654, + "epoch": 9.86, + "learning_rate": 7.513853667699822e-07, + "loss": 0.5763, + "step": 11670, + "task_loss": 0.42404186725616455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6522117853164673, + "epoch": 9.87, + "learning_rate": 7.466892082276698e-07, + "loss": 0.5742, + "step": 11671, + "task_loss": 0.19312548637390137 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6542204022407532, + "epoch": 9.87, + "learning_rate": 7.419930496853574e-07, + "loss": 0.5004, + "step": 11672, + "task_loss": 0.8001068234443665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5042328834533691, + "epoch": 9.87, + "learning_rate": 7.37296891143045e-07, + "loss": 0.5101, + "step": 11673, + "task_loss": 1.1608223915100098 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.713342010974884, + "epoch": 9.87, + "learning_rate": 7.326007326007326e-07, + "loss": 0.6223, + "step": 11674, + "task_loss": 1.5277520418167114 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7675071358680725, + "epoch": 9.87, + "learning_rate": 7.279045740584202e-07, + "loss": 0.6392, + "step": 11675, + "task_loss": 1.5556610822677612 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7275183200836182, + "epoch": 9.87, + "learning_rate": 7.232084155161078e-07, + "loss": 0.7833, + "step": 11676, + "task_loss": 1.013778805732727 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.20145830512046814, + "epoch": 9.87, + "learning_rate": 7.185122569737954e-07, + "loss": 0.6169, + "step": 11677, + "task_loss": 0.10084031522274017 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6145246624946594, + "epoch": 9.87, + "learning_rate": 7.13816098431483e-07, + "loss": 0.473, + "step": 11678, + "task_loss": 0.5278776288032532 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46959272027015686, + "epoch": 9.87, + "learning_rate": 7.091199398891706e-07, + "loss": 0.5969, + "step": 11679, + "task_loss": 0.8945805430412292 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36883777379989624, + "epoch": 9.87, + "learning_rate": 7.044237813468583e-07, + "loss": 0.5775, + "step": 11680, + "task_loss": 0.5703630447387695 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6621829867362976, + "epoch": 9.87, + "learning_rate": 6.997276228045459e-07, + "loss": 0.5539, + "step": 11681, + "task_loss": 0.19010132551193237 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.28977853059768677, + "epoch": 9.87, + "learning_rate": 6.950314642622335e-07, + "loss": 0.4927, + "step": 11682, + "task_loss": 0.059441763907670975 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5329312086105347, + "epoch": 9.88, + "learning_rate": 6.903353057199211e-07, + "loss": 0.5686, + "step": 11683, + "task_loss": 0.6747649908065796 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6110724210739136, + "epoch": 9.88, + "learning_rate": 6.856391471776087e-07, + "loss": 0.4652, + "step": 11684, + "task_loss": 0.7761861085891724 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5337547063827515, + "epoch": 9.88, + "learning_rate": 6.809429886352963e-07, + "loss": 0.6755, + "step": 11685, + "task_loss": 1.0520586967468262 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.69350266456604, + "epoch": 9.88, + "learning_rate": 6.762468300929839e-07, + "loss": 0.485, + "step": 11686, + "task_loss": 0.4552392065525055 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45390385389328003, + "epoch": 9.88, + "learning_rate": 6.715506715506716e-07, + "loss": 0.5791, + "step": 11687, + "task_loss": 0.22169362008571625 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5429032444953918, + "epoch": 9.88, + "learning_rate": 6.668545130083592e-07, + "loss": 0.4533, + "step": 11688, + "task_loss": 1.0706154108047485 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5955007672309875, + "epoch": 9.88, + "learning_rate": 6.621583544660469e-07, + "loss": 0.5626, + "step": 11689, + "task_loss": 0.730311393737793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.1659528911113739, + "epoch": 9.88, + "learning_rate": 6.574621959237345e-07, + "loss": 0.4902, + "step": 11690, + "task_loss": 0.09856458008289337 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6393058896064758, + "epoch": 9.88, + "learning_rate": 6.527660373814221e-07, + "loss": 0.4387, + "step": 11691, + "task_loss": 1.0841938257217407 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38988596200942993, + "epoch": 9.88, + "learning_rate": 6.480698788391097e-07, + "loss": 0.6805, + "step": 11692, + "task_loss": 1.0809043645858765 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4136083424091339, + "epoch": 9.88, + "learning_rate": 6.433737202967973e-07, + "loss": 0.5176, + "step": 11693, + "task_loss": 0.4464772939682007 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3602759540081024, + "epoch": 9.88, + "learning_rate": 6.386775617544849e-07, + "loss": 0.4603, + "step": 11694, + "task_loss": 0.8915525674819946 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4933736026287079, + "epoch": 9.89, + "learning_rate": 6.339814032121725e-07, + "loss": 0.5011, + "step": 11695, + "task_loss": 0.20792953670024872 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7183307409286499, + "epoch": 9.89, + "learning_rate": 6.292852446698601e-07, + "loss": 0.537, + "step": 11696, + "task_loss": 0.6770106554031372 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6647278666496277, + "epoch": 9.89, + "learning_rate": 6.245890861275477e-07, + "loss": 0.4902, + "step": 11697, + "task_loss": 0.4130721092224121 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.567448616027832, + "epoch": 9.89, + "learning_rate": 6.198929275852353e-07, + "loss": 0.6651, + "step": 11698, + "task_loss": 0.9031826853752136 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5885971784591675, + "epoch": 9.89, + "learning_rate": 6.15196769042923e-07, + "loss": 0.4846, + "step": 11699, + "task_loss": 0.6086427569389343 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4790969491004944, + "epoch": 9.89, + "learning_rate": 6.105006105006106e-07, + "loss": 0.4786, + "step": 11700, + "task_loss": 0.6837748289108276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2886730432510376, + "epoch": 9.89, + "learning_rate": 6.058044519582982e-07, + "loss": 0.3612, + "step": 11701, + "task_loss": 1.1823729276657104 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6294911503791809, + "epoch": 9.89, + "learning_rate": 6.011082934159858e-07, + "loss": 0.4841, + "step": 11702, + "task_loss": 0.4175061285495758 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5599687099456787, + "epoch": 9.89, + "learning_rate": 5.964121348736734e-07, + "loss": 0.5953, + "step": 11703, + "task_loss": 0.9556514024734497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8798742890357971, + "epoch": 9.89, + "learning_rate": 5.91715976331361e-07, + "loss": 0.6146, + "step": 11704, + "task_loss": 1.3099777698516846 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8467668294906616, + "epoch": 9.89, + "learning_rate": 5.870198177890486e-07, + "loss": 0.8594, + "step": 11705, + "task_loss": 0.611789345741272 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40101730823516846, + "epoch": 9.89, + "learning_rate": 5.823236592467362e-07, + "loss": 0.5325, + "step": 11706, + "task_loss": 0.07936819642782211 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6615309119224548, + "epoch": 9.9, + "learning_rate": 5.776275007044238e-07, + "loss": 0.5374, + "step": 11707, + "task_loss": 0.6743787527084351 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5921816825866699, + "epoch": 9.9, + "learning_rate": 5.729313421621114e-07, + "loss": 0.7002, + "step": 11708, + "task_loss": 1.563610553741455 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38395875692367554, + "epoch": 9.9, + "learning_rate": 5.68235183619799e-07, + "loss": 0.511, + "step": 11709, + "task_loss": 0.3957885801792145 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5545816421508789, + "epoch": 9.9, + "learning_rate": 5.635390250774866e-07, + "loss": 0.7129, + "step": 11710, + "task_loss": 0.9234458208084106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47973838448524475, + "epoch": 9.9, + "learning_rate": 5.588428665351742e-07, + "loss": 0.5939, + "step": 11711, + "task_loss": 0.9120413064956665 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41496413946151733, + "epoch": 9.9, + "learning_rate": 5.541467079928618e-07, + "loss": 0.4431, + "step": 11712, + "task_loss": 0.6001289486885071 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7490970492362976, + "epoch": 9.9, + "learning_rate": 5.494505494505495e-07, + "loss": 0.7053, + "step": 11713, + "task_loss": 0.42996713519096375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4524564743041992, + "epoch": 9.9, + "learning_rate": 5.447543909082371e-07, + "loss": 0.4594, + "step": 11714, + "task_loss": 0.8817393779754639 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.515659749507904, + "epoch": 9.9, + "learning_rate": 5.400582323659247e-07, + "loss": 0.5946, + "step": 11715, + "task_loss": 0.8906511664390564 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4314442276954651, + "epoch": 9.9, + "learning_rate": 5.353620738236123e-07, + "loss": 0.4708, + "step": 11716, + "task_loss": 0.9123784899711609 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.46263352036476135, + "epoch": 9.9, + "learning_rate": 5.306659152812999e-07, + "loss": 0.7295, + "step": 11717, + "task_loss": 0.8897507786750793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6467429399490356, + "epoch": 9.9, + "learning_rate": 5.259697567389875e-07, + "loss": 0.4722, + "step": 11718, + "task_loss": 1.1356202363967896 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.658686637878418, + "epoch": 9.91, + "learning_rate": 5.212735981966752e-07, + "loss": 0.6096, + "step": 11719, + "task_loss": 0.6746451258659363 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6877976655960083, + "epoch": 9.91, + "learning_rate": 5.165774396543628e-07, + "loss": 0.5476, + "step": 11720, + "task_loss": 1.2474610805511475 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36353200674057007, + "epoch": 9.91, + "learning_rate": 5.118812811120504e-07, + "loss": 0.4975, + "step": 11721, + "task_loss": 0.9711368680000305 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45902496576309204, + "epoch": 9.91, + "learning_rate": 5.07185122569738e-07, + "loss": 0.5595, + "step": 11722, + "task_loss": 1.4435758590698242 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3983616232872009, + "epoch": 9.91, + "learning_rate": 5.024889640274256e-07, + "loss": 0.5831, + "step": 11723, + "task_loss": 0.23008567094802856 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4096594452857971, + "epoch": 9.91, + "learning_rate": 4.977928054851132e-07, + "loss": 0.602, + "step": 11724, + "task_loss": 0.9441166520118713 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6864545345306396, + "epoch": 9.91, + "learning_rate": 4.930966469428008e-07, + "loss": 0.5048, + "step": 11725, + "task_loss": 0.36237865686416626 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3644891083240509, + "epoch": 9.91, + "learning_rate": 4.884004884004884e-07, + "loss": 0.4523, + "step": 11726, + "task_loss": 0.37528926134109497 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7014000415802002, + "epoch": 9.91, + "learning_rate": 4.83704329858176e-07, + "loss": 0.5005, + "step": 11727, + "task_loss": 0.7368527054786682 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.39181607961654663, + "epoch": 9.91, + "learning_rate": 4.790081713158637e-07, + "loss": 0.4966, + "step": 11728, + "task_loss": 1.5060133934020996 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6878873109817505, + "epoch": 9.91, + "learning_rate": 4.7431201277355126e-07, + "loss": 0.6024, + "step": 11729, + "task_loss": 0.7393627762794495 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4629337191581726, + "epoch": 9.91, + "learning_rate": 4.6961585423123887e-07, + "loss": 0.5482, + "step": 11730, + "task_loss": 1.063454270362854 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5009876489639282, + "epoch": 9.92, + "learning_rate": 4.649196956889265e-07, + "loss": 0.7925, + "step": 11731, + "task_loss": 0.6962403655052185 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8067636489868164, + "epoch": 9.92, + "learning_rate": 4.602235371466141e-07, + "loss": 0.528, + "step": 11732, + "task_loss": 1.8258051872253418 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7342110276222229, + "epoch": 9.92, + "learning_rate": 4.555273786043017e-07, + "loss": 0.6111, + "step": 11733, + "task_loss": 0.9588983654975891 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4452546238899231, + "epoch": 9.92, + "learning_rate": 4.508312200619893e-07, + "loss": 0.5633, + "step": 11734, + "task_loss": 0.6177554726600647 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6966890096664429, + "epoch": 9.92, + "learning_rate": 4.4613506151967687e-07, + "loss": 0.5864, + "step": 11735, + "task_loss": 0.523197591304779 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3445895314216614, + "epoch": 9.92, + "learning_rate": 4.414389029773645e-07, + "loss": 0.5029, + "step": 11736, + "task_loss": 0.14446979761123657 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.648215651512146, + "epoch": 9.92, + "learning_rate": 4.367427444350521e-07, + "loss": 0.6209, + "step": 11737, + "task_loss": 0.6089484095573425 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.694189190864563, + "epoch": 9.92, + "learning_rate": 4.320465858927397e-07, + "loss": 0.6191, + "step": 11738, + "task_loss": 0.828980565071106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.40063929557800293, + "epoch": 9.92, + "learning_rate": 4.273504273504274e-07, + "loss": 0.5578, + "step": 11739, + "task_loss": 0.7841668725013733 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34507930278778076, + "epoch": 9.92, + "learning_rate": 4.22654268808115e-07, + "loss": 0.6069, + "step": 11740, + "task_loss": 0.515508770942688 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5274267196655273, + "epoch": 9.92, + "learning_rate": 4.1795811026580264e-07, + "loss": 0.4529, + "step": 11741, + "task_loss": 0.4202560484409332 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.567223846912384, + "epoch": 9.93, + "learning_rate": 4.1326195172349025e-07, + "loss": 0.5515, + "step": 11742, + "task_loss": 0.7392473220825195 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.38944780826568604, + "epoch": 9.93, + "learning_rate": 4.0856579318117786e-07, + "loss": 0.5518, + "step": 11743, + "task_loss": 0.721247673034668 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7464834451675415, + "epoch": 9.93, + "learning_rate": 4.0386963463886547e-07, + "loss": 0.6989, + "step": 11744, + "task_loss": 0.30686262249946594 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7815307378768921, + "epoch": 9.93, + "learning_rate": 3.991734760965531e-07, + "loss": 0.659, + "step": 11745, + "task_loss": 0.9529940485954285 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4590678811073303, + "epoch": 9.93, + "learning_rate": 3.944773175542407e-07, + "loss": 0.5874, + "step": 11746, + "task_loss": 1.4956971406936646 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.9558858275413513, + "epoch": 9.93, + "learning_rate": 3.897811590119283e-07, + "loss": 0.6184, + "step": 11747, + "task_loss": 0.7298904061317444 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.37069857120513916, + "epoch": 9.93, + "learning_rate": 3.850850004696159e-07, + "loss": 0.6279, + "step": 11748, + "task_loss": 1.27947199344635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.569645881652832, + "epoch": 9.93, + "learning_rate": 3.803888419273035e-07, + "loss": 0.4203, + "step": 11749, + "task_loss": 1.2289179563522339 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5442161560058594, + "epoch": 9.93, + "learning_rate": 3.756926833849911e-07, + "loss": 0.5799, + "step": 11750, + "task_loss": 1.014480710029602 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3223392367362976, + "epoch": 9.93, + "learning_rate": 3.709965248426787e-07, + "loss": 0.461, + "step": 11751, + "task_loss": 1.4547274112701416 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.31352534890174866, + "epoch": 9.93, + "learning_rate": 3.663003663003663e-07, + "loss": 0.618, + "step": 11752, + "task_loss": 0.13427120447158813 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7314385175704956, + "epoch": 9.93, + "learning_rate": 3.616042077580539e-07, + "loss": 0.528, + "step": 11753, + "task_loss": 1.0424449443817139 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3079354763031006, + "epoch": 9.94, + "learning_rate": 3.569080492157415e-07, + "loss": 0.4352, + "step": 11754, + "task_loss": 0.17069625854492188 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.30831047892570496, + "epoch": 9.94, + "learning_rate": 3.5221189067342913e-07, + "loss": 0.4185, + "step": 11755, + "task_loss": 0.17191042006015778 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5566548109054565, + "epoch": 9.94, + "learning_rate": 3.4751573213111674e-07, + "loss": 0.5972, + "step": 11756, + "task_loss": 0.35748881101608276 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2582440972328186, + "epoch": 9.94, + "learning_rate": 3.4281957358880435e-07, + "loss": 0.3549, + "step": 11757, + "task_loss": 0.09129612892866135 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6168367862701416, + "epoch": 9.94, + "learning_rate": 3.3812341504649196e-07, + "loss": 0.6388, + "step": 11758, + "task_loss": 0.5061416029930115 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5181525945663452, + "epoch": 9.94, + "learning_rate": 3.334272565041796e-07, + "loss": 0.4745, + "step": 11759, + "task_loss": 0.48060616850852966 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41481879353523254, + "epoch": 9.94, + "learning_rate": 3.2873109796186723e-07, + "loss": 0.4473, + "step": 11760, + "task_loss": 0.23127619922161102 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4852718710899353, + "epoch": 9.94, + "learning_rate": 3.2403493941955484e-07, + "loss": 0.5705, + "step": 11761, + "task_loss": 0.2514514923095703 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6261858940124512, + "epoch": 9.94, + "learning_rate": 3.1933878087724245e-07, + "loss": 0.5049, + "step": 11762, + "task_loss": 0.3908676505088806 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5693542957305908, + "epoch": 9.94, + "learning_rate": 3.1464262233493006e-07, + "loss": 0.531, + "step": 11763, + "task_loss": 0.5728221535682678 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4909439980983734, + "epoch": 9.94, + "learning_rate": 3.0994646379261767e-07, + "loss": 0.6085, + "step": 11764, + "task_loss": 0.5773086547851562 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5992412567138672, + "epoch": 9.94, + "learning_rate": 3.052503052503053e-07, + "loss": 0.5379, + "step": 11765, + "task_loss": 0.4260658025741577 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.36398622393608093, + "epoch": 9.95, + "learning_rate": 3.005541467079929e-07, + "loss": 0.3985, + "step": 11766, + "task_loss": 0.23804312944412231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44097012281417847, + "epoch": 9.95, + "learning_rate": 2.958579881656805e-07, + "loss": 0.3858, + "step": 11767, + "task_loss": 0.17028015851974487 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.339007705450058, + "epoch": 9.95, + "learning_rate": 2.911618296233681e-07, + "loss": 0.4751, + "step": 11768, + "task_loss": 0.1849353164434433 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.32081252336502075, + "epoch": 9.95, + "learning_rate": 2.864656710810557e-07, + "loss": 0.4706, + "step": 11769, + "task_loss": 0.2316647171974182 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5445622205734253, + "epoch": 9.95, + "learning_rate": 2.817695125387433e-07, + "loss": 0.5206, + "step": 11770, + "task_loss": 0.3381626307964325 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8521730899810791, + "epoch": 9.95, + "learning_rate": 2.770733539964309e-07, + "loss": 0.5178, + "step": 11771, + "task_loss": 1.0700209140777588 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4060363173484802, + "epoch": 9.95, + "learning_rate": 2.7237719545411855e-07, + "loss": 0.466, + "step": 11772, + "task_loss": 1.5562776327133179 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48679983615875244, + "epoch": 9.95, + "learning_rate": 2.6768103691180616e-07, + "loss": 0.552, + "step": 11773, + "task_loss": 0.06236816942691803 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7523693442344666, + "epoch": 9.95, + "learning_rate": 2.629848783694938e-07, + "loss": 0.5166, + "step": 11774, + "task_loss": 1.7538937330245972 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3473876118659973, + "epoch": 9.95, + "learning_rate": 2.582887198271814e-07, + "loss": 0.5965, + "step": 11775, + "task_loss": 1.2157716751098633 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.41753390431404114, + "epoch": 9.95, + "learning_rate": 2.53592561284869e-07, + "loss": 0.4986, + "step": 11776, + "task_loss": 0.9441444277763367 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.45573607087135315, + "epoch": 9.95, + "learning_rate": 2.488964027425566e-07, + "loss": 0.5765, + "step": 11777, + "task_loss": 0.7261573672294617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6504294872283936, + "epoch": 9.96, + "learning_rate": 2.442002442002442e-07, + "loss": 0.6486, + "step": 11778, + "task_loss": 1.0529546737670898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2786175310611725, + "epoch": 9.96, + "learning_rate": 2.395040856579318e-07, + "loss": 0.5115, + "step": 11779, + "task_loss": 0.6888869404792786 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48786431550979614, + "epoch": 9.96, + "learning_rate": 2.3480792711561944e-07, + "loss": 0.5833, + "step": 11780, + "task_loss": 0.7851517200469971 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.47927629947662354, + "epoch": 9.96, + "learning_rate": 2.3011176857330705e-07, + "loss": 0.4253, + "step": 11781, + "task_loss": 0.36839035153388977 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.20583805441856384, + "epoch": 9.96, + "learning_rate": 2.2541561003099466e-07, + "loss": 0.4041, + "step": 11782, + "task_loss": 0.3866371512413025 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5634337663650513, + "epoch": 9.96, + "learning_rate": 2.2071945148868224e-07, + "loss": 0.5032, + "step": 11783, + "task_loss": 0.6320585608482361 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.35720694065093994, + "epoch": 9.96, + "learning_rate": 2.1602329294636985e-07, + "loss": 0.5118, + "step": 11784, + "task_loss": 0.41417720913887024 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5381942987442017, + "epoch": 9.96, + "learning_rate": 2.113271344040575e-07, + "loss": 0.721, + "step": 11785, + "task_loss": 0.6323438882827759 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4419476389884949, + "epoch": 9.96, + "learning_rate": 2.0663097586174512e-07, + "loss": 0.4427, + "step": 11786, + "task_loss": 0.07649518549442291 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6069304943084717, + "epoch": 9.96, + "learning_rate": 2.0193481731943273e-07, + "loss": 0.4929, + "step": 11787, + "task_loss": 0.2779400050640106 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2164638489484787, + "epoch": 9.96, + "learning_rate": 1.9723865877712034e-07, + "loss": 0.4137, + "step": 11788, + "task_loss": 0.6758087873458862 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6397323608398438, + "epoch": 9.96, + "learning_rate": 1.9254250023480795e-07, + "loss": 0.4469, + "step": 11789, + "task_loss": 0.3655050992965698 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3389008045196533, + "epoch": 9.97, + "learning_rate": 1.8784634169249554e-07, + "loss": 0.4653, + "step": 11790, + "task_loss": 0.17872558534145355 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42615583539009094, + "epoch": 9.97, + "learning_rate": 1.8315018315018315e-07, + "loss": 0.378, + "step": 11791, + "task_loss": 0.5046189427375793 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4162856340408325, + "epoch": 9.97, + "learning_rate": 1.7845402460787076e-07, + "loss": 0.4501, + "step": 11792, + "task_loss": 0.10605460405349731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5327898263931274, + "epoch": 9.97, + "learning_rate": 1.7375786606555837e-07, + "loss": 0.4982, + "step": 11793, + "task_loss": 0.27749985456466675 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48357489705085754, + "epoch": 9.97, + "learning_rate": 1.6906170752324598e-07, + "loss": 0.5904, + "step": 11794, + "task_loss": 0.49925893545150757 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.534135103225708, + "epoch": 9.97, + "learning_rate": 1.6436554898093362e-07, + "loss": 0.5732, + "step": 11795, + "task_loss": 1.051812767982483 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3636243939399719, + "epoch": 9.97, + "learning_rate": 1.5966939043862123e-07, + "loss": 0.5997, + "step": 11796, + "task_loss": 0.8055181503295898 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.27140694856643677, + "epoch": 9.97, + "learning_rate": 1.5497323189630884e-07, + "loss": 0.3725, + "step": 11797, + "task_loss": 0.6648226976394653 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7842380404472351, + "epoch": 9.97, + "learning_rate": 1.5027707335399645e-07, + "loss": 0.6314, + "step": 11798, + "task_loss": 0.7309542894363403 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.478877454996109, + "epoch": 9.97, + "learning_rate": 1.4558091481168406e-07, + "loss": 0.5019, + "step": 11799, + "task_loss": 1.2602530717849731 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.42598360776901245, + "epoch": 9.97, + "learning_rate": 1.4088475626937164e-07, + "loss": 0.4346, + "step": 11800, + "task_loss": 0.2025788426399231 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.44479840993881226, + "epoch": 9.97, + "learning_rate": 1.3618859772705928e-07, + "loss": 0.401, + "step": 11801, + "task_loss": 0.42690378427505493 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3598289489746094, + "epoch": 9.98, + "learning_rate": 1.314924391847469e-07, + "loss": 0.3704, + "step": 11802, + "task_loss": 0.6306514739990234 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4371466040611267, + "epoch": 9.98, + "learning_rate": 1.267962806424345e-07, + "loss": 0.4133, + "step": 11803, + "task_loss": 0.977277398109436 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5131658315658569, + "epoch": 9.98, + "learning_rate": 1.221001221001221e-07, + "loss": 0.4065, + "step": 11804, + "task_loss": 0.47720029950141907 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4955328702926636, + "epoch": 9.98, + "learning_rate": 1.1740396355780972e-07, + "loss": 0.4955, + "step": 11805, + "task_loss": 0.4314495325088501 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.448905348777771, + "epoch": 9.98, + "learning_rate": 1.1270780501549733e-07, + "loss": 0.4611, + "step": 11806, + "task_loss": 0.334781676530838 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3976094722747803, + "epoch": 9.98, + "learning_rate": 1.0801164647318492e-07, + "loss": 0.4589, + "step": 11807, + "task_loss": 0.614673376083374 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7028859853744507, + "epoch": 9.98, + "learning_rate": 1.0331548793087256e-07, + "loss": 0.5392, + "step": 11808, + "task_loss": 0.4435502588748932 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4882431626319885, + "epoch": 9.98, + "learning_rate": 9.861932938856017e-08, + "loss": 0.4171, + "step": 11809, + "task_loss": 0.5074236989021301 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8158570528030396, + "epoch": 9.98, + "learning_rate": 9.392317084624777e-08, + "loss": 0.6792, + "step": 11810, + "task_loss": 1.2517924308776855 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8543037176132202, + "epoch": 9.98, + "learning_rate": 8.922701230393538e-08, + "loss": 0.6036, + "step": 11811, + "task_loss": 1.1480350494384766 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.50777667760849, + "epoch": 9.98, + "learning_rate": 8.453085376162299e-08, + "loss": 0.5674, + "step": 11812, + "task_loss": 0.4583331048488617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.274361252784729, + "epoch": 9.99, + "learning_rate": 7.983469521931061e-08, + "loss": 0.4557, + "step": 11813, + "task_loss": 0.398992121219635 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4127875864505768, + "epoch": 9.99, + "learning_rate": 7.513853667699822e-08, + "loss": 0.5221, + "step": 11814, + "task_loss": 0.43059873580932617 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4107823669910431, + "epoch": 9.99, + "learning_rate": 7.044237813468582e-08, + "loss": 0.5331, + "step": 11815, + "task_loss": 0.3799629807472229 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.810592532157898, + "epoch": 9.99, + "learning_rate": 6.574621959237344e-08, + "loss": 0.6049, + "step": 11816, + "task_loss": 1.771484375 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.48412024974823, + "epoch": 9.99, + "learning_rate": 6.105006105006105e-08, + "loss": 0.4849, + "step": 11817, + "task_loss": 1.0037184953689575 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3843819797039032, + "epoch": 9.99, + "learning_rate": 5.6353902507748664e-08, + "loss": 0.4758, + "step": 11818, + "task_loss": 0.6112626791000366 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.3779985010623932, + "epoch": 9.99, + "learning_rate": 5.165774396543628e-08, + "loss": 0.3602, + "step": 11819, + "task_loss": 0.03185408189892769 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6673596501350403, + "epoch": 9.99, + "learning_rate": 4.6961585423123884e-08, + "loss": 0.5006, + "step": 11820, + "task_loss": 1.4893971681594849 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.485041081905365, + "epoch": 9.99, + "learning_rate": 4.2265426880811495e-08, + "loss": 0.4828, + "step": 11821, + "task_loss": 0.6400530934333801 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.4484560191631317, + "epoch": 9.99, + "learning_rate": 3.756926833849911e-08, + "loss": 0.4141, + "step": 11822, + "task_loss": 0.8475421667098999 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.1720668524503708, + "epoch": 9.99, + "learning_rate": 3.287310979618672e-08, + "loss": 0.4459, + "step": 11823, + "task_loss": 0.05403786897659302 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.5080179572105408, + "epoch": 9.99, + "learning_rate": 2.8176951253874332e-08, + "loss": 0.5987, + "step": 11824, + "task_loss": 0.8450747132301331 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2977520227432251, + "epoch": 10.0, + "learning_rate": 2.3480792711561942e-08, + "loss": 0.4515, + "step": 11825, + "task_loss": 0.07201095670461655 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.7180753946304321, + "epoch": 10.0, + "learning_rate": 1.8784634169249556e-08, + "loss": 0.6341, + "step": 11826, + "task_loss": 0.730100691318512 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.34227943420410156, + "epoch": 10.0, + "learning_rate": 1.4088475626937166e-08, + "loss": 0.5517, + "step": 11827, + "task_loss": 1.168721318244934 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.8317971229553223, + "epoch": 10.0, + "learning_rate": 9.392317084624778e-09, + "loss": 0.5132, + "step": 11828, + "task_loss": 0.9987679123878479 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.6715251207351685, + "epoch": 10.0, + "learning_rate": 4.696158542312389e-09, + "loss": 0.5417, + "step": 11829, + "task_loss": 0.7848132252693176 + }, + { + "compression/movement_sparsity/importance_regularization_factor": 1.0, + "compression/movement_sparsity/importance_threshold": 0.0, + "compression/movement_sparsity/linear_layer_sparsity": 0.40466940388319583, + "compression/movement_sparsity/model_sparsity": 0.39076776495110943, + "compression_loss": 0.0, + "distillation_loss": 0.2879047989845276, + "epoch": 10.0, + "learning_rate": 0.0, + "loss": 0.4832, + "step": 11830, + "task_loss": 1.1391990184783936 + }, + { + "epoch": 10.0, + "step": 11830, + "total_flos": 5.9664632082415714e+19, + "train_loss": 25.706991044825553, + "train_runtime": 44544.7439, + "train_samples_per_second": 17.005, + "train_steps_per_second": 0.266 + } + ], + "max_steps": 11830, + "num_train_epochs": 10, + "total_flos": 5.9664632082415714e+19, + "trial_name": null, + "trial_params": null +}